def hdfs():
    hdfs = HDFileSystem(host='localhost', port=8020)
    if hdfs.exists('/tmp/test'):
        hdfs.rm('/tmp/test')
    hdfs.mkdir('/tmp/test')

    yield hdfs

    if hdfs.exists('/tmp/test'):
        hdfs.rm('/tmp/test')
def hdfs():
    hdfs = HDFileSystem(host="localhost", port=8020)
    if hdfs.exists("/tmp/test"):
        hdfs.rm("/tmp/test")
    hdfs.mkdir("/tmp/test")

    yield hdfs

    if hdfs.exists("/tmp/test"):
        hdfs.rm("/tmp/test")
def make_hdfs():
    hdfs = HDFileSystem(host='localhost', port=8020)
    if hdfs.exists('/tmp/test'):
        hdfs.rm('/tmp/test')
    hdfs.mkdir('/tmp/test')

    try:
        yield hdfs
    finally:
        if hdfs.exists('/tmp/test'):
            hdfs.rm('/tmp/test')
def hdfs():
    hdfs = HDFileSystem(host='localhost', port=8020,
                        pars={'rpc.client.connect.retry': '2'})
    if hdfs.exists('/tmp/test'):
        hdfs.rm('/tmp/test')
    hdfs.mkdir('/tmp/test')

    yield hdfs

    if hdfs.exists('/tmp/test'):
        hdfs.rm('/tmp/test')
    hdfs.disconnect()
def hdfs():
    hdfs = HDFileSystem(host=test_host, port=test_port,
                        pars={'rpc.client.connect.retry': '2'})
    if hdfs.exists('/tmp/test'):
        hdfs.rm('/tmp/test')
    hdfs.mkdir('/tmp/test')

    yield hdfs

    if hdfs.exists('/tmp/test'):
        hdfs.rm('/tmp/test', recursive=True)
    hdfs.disconnect()
def make_hdfs():
    from hdfs3 import HDFileSystem
    # from .hdfs import DaskHDFileSystem
    basedir = '/tmp/test-distributed'
    hdfs = HDFileSystem(host='localhost', port=8020)
    if hdfs.exists(basedir):
        hdfs.rm(basedir)
    hdfs.mkdir(basedir)

    try:
        yield hdfs, basedir
    finally:
        if hdfs.exists(basedir):
            hdfs.rm(basedir)
def _main_(self):
    # client = Client(url, root=None, proxy=None, timeout=None, session=None)
    # client = Client("http://hadoop:50070")
    # client = InsecureClient("http://10.0.75.1:50070/", user='******')
    # client = InsecureClient("http://120.78.186.82:50070", user='******')
    hdfs = HDFileSystem(host="10.0.75.1", port='8020')
    path = "/data/"
    hdfs.cancel_token(token=None)  # unclear; original author asked for pointers
    hdfs.cat(path)  # get the contents of the given directory or file
    print(hdfs)
    print(hdfs.exists(path))
    # hdfs.chmod(path, mode)  # change permissions on the given path
    # hdfs.chown(path, owner, group)  # change the owner and group of a path
    # hdfs.concat(destination, paths)  # merge the files at `paths` into one file at `destination`; the source files are deleted on successful completion
    # hdfs.connect()  # connect to the namenode; happens automatically at startup, presumably when HDFileSystem(host='127.0.0.1', port=8020) is constructed
    # hdfs.delegate_token(user=None)
    # hdfs.df()  # used/free disk space on the HDFS system
    # hdfs.disconnect()  # opposite of connect(): drop the connection
    # hdfs.du(path, total=False, deep=False)  # file sizes under the given path; `total` sums them, `deep` recurses into subdirectories
    # hdfs.exists(path)  # whether the path exists
    # hdfs.get(hdfs_path, local_path, blocksize=65536)  # copy an HDFS file to the local machine; `blocksize` sets how much is read at a time
    # hdfs.get_block_locations(path, start=0, length=0)  # physical locations of the blocks
    # hdfs.getmerge(path, filename, blocksize=65536)  # fetch all files under the given directory, merged into one local file
    # hdfs.glob(path)  # e.g. /user/spark/abc-*.txt -- list of paths matching the pattern
    # hdfs.head(path, size=1024)  # first part of the file at the given path
    # hdfs.info(path)  # information about the file at the given path
    # hdfs.isdir(path)  # whether the path is a directory
    # hdfs.isfile(path)  # whether the path is a file
    # hdfs.list_encryption_zones()  # list all encryption zones
    # hdfs.ls(path, detail=False)  # file paths under the given directory; `detail` adds file details
    # hdfs.makedirs(path, mode=457)  # create a directory tree, like mkdir -p
    # hdfs.mkdir(path)  # create a directory
    # hdfs.mv(path1, path2)  # move path1 to path2
    # open(path, mode='rb', replication=0, buff=0, block_size=0)  # open a file, similar to Python's built-in file reading
    # hdfs.put(filename, path, chunk=65536, replication=0, block_size=0)  # upload a local file to the given HDFS directory
    # hdfs.read_block(fn, offset, length, delimiter=None)  # read `length` bytes of the file starting at `offset`; `delimiter` ensures the read starts and stops on that bytestring
    # hdfs.read_block('/data/file.csv', 0, 13)
    # hdfs.read_block('/data/file.csv', 0, 13, delimiter=b'\n')
    # hdfs.rm(path, recursive=True)  # delete the given path; `recursive` deletes recursively
    # hdfs.tail(path, size=1024)  # last part of the file
    # hdfs.touch(path)  # create an empty file
    # hdfs.walk(path)  # walk the file tree
    # print(client)
    # hdfs.put_to_hdfs(client, 'F:\\Maven\\work\\CDES\\code\\LogDemo', '/data')
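The commented walkthrough above covers most of the hdfs3 client surface. As a minimal sketch (not taken from the example above), a round trip with the same API might look like the following; the localhost:8020 namenode and the /tmp/hdfs3_demo path are assumptions chosen for illustration:

# Minimal hdfs3 round trip; namenode address and demo path are assumed values.
from hdfs3 import HDFileSystem

hdfs = HDFileSystem(host='localhost', port=8020)    # connect to the namenode
hdfs.mkdir('/tmp/hdfs3_demo')                       # create a directory
with hdfs.open('/tmp/hdfs3_demo/hello.txt', 'wb') as f:
    f.write(b'hello from hdfs3')                    # write a small file
print(hdfs.ls('/tmp/hdfs3_demo', detail=False))     # list the directory
print(hdfs.cat('/tmp/hdfs3_demo/hello.txt'))        # read the file back as bytes
hdfs.rm('/tmp/hdfs3_demo', recursive=True)          # clean up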
def insert_csv(self, table_name, name_node_url, name_node_options, csv_file,
               df_flag, autolimit, displaylimit=100):
    """
    Enables insertion of CSVs or DataFrames to Hive.
    """
    if self.cluster_details:
        if 'name_node_url' in self.cluster_details:
            name_node_url = self.cluster_details['name_node_url']
        if 'name_node_opts' in self.cluster_details:
            name_node_options = self.cluster_details['name_node_opts']
    csv_file_name = csv_file.split('/')[-1]
    folder_name = csv_file_name.split('.')[0]
    hdfs_location = '/user/{}/notebooks/{}'.format(getpass.getuser(),
                                                   folder_name)
    hdfs = HDFileSystem(host=name_node_url, pars=name_node_options)
    if not hdfs.exists(hdfs_location):
        hdfs.mkdir(hdfs_location)
    hdfs_file_location = '{}/{}'.format(hdfs_location, csv_file_name)
    hdfs.put(csv_file, hdfs_file_location)
    data_type = HiveConnection.csv_datatypes(csv_file)
    data_type_list = ',\n'.join(
        ['%s %s' % (key, value) for (key, value) in data_type.items()])
    create_table_command = "CREATE EXTERNAL TABLE {} ({})" \
                           "\nROW FORMAT DELIMITED\nFIELDS TERMINATED BY " \
                           "','\nSTORED AS TEXTFILE\nLOCATION '{}'\n" \
                           "tblproperties(" \
                           "\"skip.header.line.count\"=\"1\");" \
        .format(table_name, data_type_list, hdfs_location)
    drop_table_command = 'DROP TABLE IF EXISTS {};'.format(table_name)
    command = drop_table_command + '\n' + create_table_command
    if df_flag:
        os.system("rm %s" % csv_file)
    return self.execute(command, autolimit, displaylimit)
'''
Created on Jun 10, 2017

@author: SathishParthasarathy
'''
from pyspark import SparkConf, SparkContext
from hdfs3 import HDFileSystem

if __name__ == '__main__':
    conf = SparkConf().setAppName("Word Count - Python")
    spark = SparkContext(conf=conf)
    hdfs = HDFileSystem('hadoop.master.com', port=9000)
    if not hdfs.exists("/user/psathishcs/Output/Books/Science_Python"):
        text_file = spark.textFile(
            "hdfs://hadoop.master.com:9000/user/psathishcs/Input/Books/The_Outline_of_Science.txt"
        )
        words = text_file.flatMap(lambda line: line.split())
        wordCounts = words.map(lambda word: (word, 1)).reduceByKey(
            lambda a, b: a + b)
        wordCounts.saveAsTextFile(
            "hdfs://hadoop.master.com:9000/user/psathishcs/Output/Books/Science_Python"
        )
if __name__ == "__main__":
    todayStr1 = datetime.strftime(datetime.now(), '%Y%m%d')
    todayStr2 = datetime.strftime(datetime.now(), '%Y-%m-%d')
    hdfs = HDFileSystem(host='10.0.1.218', port=9000)
    # df_org_features = org_features(df_org, df_holidayList, df_holidayTOdate)
    df_org, df_holidayList, df_holidayTOdate = spark_readData()
    print('read data is OK!!!')
    df_orgFeatures = org_features(df_org, df_holidayList, df_holidayTOdate)
    print('df_orgFeatures is OK!!!')
    df_buildFeatures = build_features(df_orgFeatures)
    print('df_buildFeatures is OK!!!')
    lr_historyData = lr_history_data(df_buildFeatures)
    print('lr_historyData is OK!!!')
    train_data, val_data = train_val_data(lr_historyData)
    print("train_data and val_data is OK!!!")
    # mseError_DF = lr_trainModel(train_data, val_data)
    # mseError_DF.show()
    # print('LR_Model is OK!!!')
    org_predictData = org_predictData(df_buildFeatures, df_holidayList)
    print("org_predictData is OK!!!")
    lr_predictData = lr_predictData(org_predictData)
    if hdfs.exists('hdfs://10.0.1.218:9000/predict-2019/predict_data_20190624.parquet'):
        hdfs.rm('hdfs://10.0.1.218:9000/predict-2019/predict_data_20190624.parquet')
    lr_predictData.write.format('parquet').save(
        'hdfs://10.0.1.218:9000/predict-2019/predict_data_20190624.parquet')
    print("lr_predictData is OK!!!")
#!/bin/env python
# -*- coding: utf-8 -*-
import os.path

from hdfs3 import HDFileSystem

import config

print 'NameNode host:', config.NAMENODE_HOST
print 'NameNode port:', config.NAMENODE_PORT

client = HDFileSystem(host=config.NAMENODE_HOST, port=config.NAMENODE_PORT)

remote_dir = os.path.dirname(config.RFILE_FMT)
if not client.exists(remote_dir):
    client.mkdir(remote_dir)

for day in range(0, config.DAYS):
    src = "".join([config.LFILE_FMT, str(day)])
    dst = "".join([config.RFILE_FMT, str(day)])
    if not os.path.exists(src):
        print 'Skipping:', src, 'file not found!'
        continue
    if client.exists(dst):
        print 'Skipping:', dst, 'file already exists: hadoop fs -rm', dst
        continue
    print 'Uploading', src, '=>', dst
    client.put(src, dst)
class KafkaConsumer(object):
    def __init__(self, topics):
        # self.consumer = KafkaConsumer(
        #     topics,
        #     bootstrap_servers=['localhost:9092'],
        #     auto_offset_reset='earliest',
        #     enable_auto_commit=True,
        #     group_id='my-group',
        #     value_deserializer=lambda x: loads(x.decode('utf-8')))
        # self.HDFS = pa.HDFS.connect(host='localhost', port=9000)
        self.hdfs = HDFileSystem(host='localhost', port=9000)
        self.destination_path = ""
        self.dest_path_tweet = '/user/BigData/tweet_data'
        self.dest_path_rss = '/user/BigData/rss_data'
        self.dest_path_corona = '/user/BigData/corona_data'
        self.temp_path = 'temp_data'

    def write_to_file(self, source_df, data_type="tweet"):
        """
        Write collected data to HDFS. If file is collected on the same day,
        method will retrieve the previous dataframe, combine it and write
        back to HDFS.
        Ref: https://hdfs3.readthedocs.io/en/latest/
        :param source_df: Dataframe to be written
        :param data_type: "tweet", "rss", "corona" only these 3
        :return:
        """
        try:
            if len(source_df) > 0:
                if "tweet" in str(data_type).lower():
                    self.destination_path = self.dest_path_tweet
                elif "rss" in str(data_type).lower():
                    self.destination_path = self.dest_path_rss
                elif "corona" in str(data_type).lower():
                    self.destination_path = self.dest_path_corona
                else:
                    raise Exception("Invalid data type, unsure where to storage in HDFS.")

                # write to temp storage
                file_name = 'temp.csv'
                temp_path = os.path.join(self.temp_path, file_name)
                source_df.to_csv(temp_path)

                # check see if existing csv, if yes combine them
                date_str = datetime.now().strftime("%d-%m-%Y")
                file_name = 'tweets_{0}.csv'.format(date_str)
                hdfs_path = os.path.join(self.destination_path, file_name)
                if self.hdfs.exists(hdfs_path):
                    with self.hdfs.open(hdfs_path) as f:
                        exist_df = pd.read_csv(f)
                    source_df = pd.concat([source_df, exist_df])
                    self.hdfs.rm(hdfs_path)  # remove and write a new one

                # pushing to HDFS
                self.hdfs.put(temp_path, hdfs_path)
                print("Write to HDFS completed: ", source_df.shape)
        except Exception as e:
            print(str(e))
class HadoopFileSystem(FileSystem):
  """``FileSystem`` implementation that supports HDFS.

  URL arguments to methods expect strings starting with ``hdfs://``.

  Uses client library :class:`hdfs3.core.HDFileSystem`.
  """

  def __init__(self):
    """Initializes a connection to HDFS.

    Connection configuration is done using :doc:`hdfs`.
    """
    super(HadoopFileSystem, self).__init__()
    self._hdfs_client = HDFileSystem()

  @classmethod
  def scheme(cls):
    return 'hdfs'

  @staticmethod
  def _parse_url(url):
    """Verifies that url begins with hdfs:// prefix, strips it and adds a
    leading /.

    Raises:
      ValueError if url doesn't begin with hdfs://.

    Args:
      url: A URL in the form hdfs://path/...

    Returns:
      For an input of 'hdfs://path/...', will return '/path/...'.
    """
    m = _URL_RE.match(url)
    if m is None:
      raise ValueError('Could not parse url: %s' % url)
    return m.group(1)

  def join(self, base_url, *paths):
    """Join two or more pathname components.

    Args:
      base_url: string path of the first component of the path.
        Must start with hdfs://.
      paths: path components to be added

    Returns:
      Full url after combining all the passed components.
    """
    basepath = self._parse_url(base_url)
    return _HDFS_PREFIX + self._join(basepath, *paths)

  def _join(self, basepath, *paths):
    return posixpath.join(basepath, *paths)

  def split(self, url):
    rel_path = self._parse_url(url)
    head, tail = posixpath.split(rel_path)
    return _HDFS_PREFIX + head, tail

  def mkdirs(self, url):
    path = self._parse_url(url)
    if self._exists(path):
      raise IOError('Path already exists: %s' % path)
    return self._mkdirs(path)

  def _mkdirs(self, path):
    self._hdfs_client.makedirs(path)

  def match(self, url_patterns, limits=None):
    if limits is None:
      limits = [None] * len(url_patterns)
    if len(url_patterns) != len(limits):
      raise BeamIOError(
          'Patterns and limits should be equal in length: %d != %d' % (
              len(url_patterns), len(limits)))

    # TODO(udim): Update client to allow batched results.
    def _match(path_pattern, limit):
      """Find all matching paths to the pattern provided."""
      file_infos = self._hdfs_client.ls(path_pattern, detail=True)[:limit]
      metadata_list = [FileMetadata(file_info['name'], file_info['size'])
                       for file_info in file_infos]
      return MatchResult(path_pattern, metadata_list)

    exceptions = {}
    result = []
    for url_pattern, limit in zip(url_patterns, limits):
      try:
        path_pattern = self._parse_url(url_pattern)
        result.append(_match(path_pattern, limit))
      except Exception as e:  # pylint: disable=broad-except
        exceptions[url_pattern] = e

    if exceptions:
      raise BeamIOError('Match operation failed', exceptions)
    return result

  def _open_hdfs(self, path, mode, mime_type, compression_type):
    if mime_type != 'application/octet-stream':
      logging.warning('Mime types are not supported. Got non-default mime_type:'
                      ' %s', mime_type)
    if compression_type == CompressionTypes.AUTO:
      compression_type = CompressionTypes.detect_compression_type(path)
    res = self._hdfs_client.open(path, mode)
    if compression_type != CompressionTypes.UNCOMPRESSED:
      res = CompressedFile(res)
    return res

  def create(self, url, mime_type='application/octet-stream',
             compression_type=CompressionTypes.AUTO):
    """
    Returns:
      *hdfs3.core.HDFile*: A Python File-like object.
    """
    path = self._parse_url(url)
    return self._create(path, mime_type, compression_type)

  def _create(self, path, mime_type='application/octet-stream',
              compression_type=CompressionTypes.AUTO):
    return self._open_hdfs(path, 'wb', mime_type, compression_type)

  def open(self, url, mime_type='application/octet-stream',
           compression_type=CompressionTypes.AUTO):
    """
    Returns:
      *hdfs3.core.HDFile*: A Python File-like object.
    """
    path = self._parse_url(url)
    return self._open(path, mime_type, compression_type)

  def _open(self, path, mime_type='application/octet-stream',
            compression_type=CompressionTypes.AUTO):
    return self._open_hdfs(path, 'rb', mime_type, compression_type)

  def copy(self, source_file_names, destination_file_names):
    """
    Will overwrite files and directories in destination_file_names.

    Raises ``BeamIOError`` if any error occurred.

    Args:
      source_file_names: iterable of URLs.
      destination_file_names: iterable of URLs.
    """
    if len(source_file_names) != len(destination_file_names):
      raise BeamIOError(
          'source_file_names and destination_file_names should '
          'be equal in length: %d != %d' % (
              len(source_file_names), len(destination_file_names)))

    def _copy_file(source, destination):
      with self._open(source) as f1:
        with self._create(destination) as f2:
          while True:
            buf = f1.read(_COPY_BUFFER_SIZE)
            if not buf:
              break
            f2.write(buf)

    def _copy_path(source, destination):
      """Recursively copy the file tree from the source to the destination."""
      if not self._hdfs_client.isdir(source):
        _copy_file(source, destination)
        return

      for path, dirs, files in self._hdfs_client.walk(source):
        for dir in dirs:
          new_dir = self._join(destination, dir)
          if not self._exists(new_dir):
            self._mkdirs(new_dir)

        rel_path = posixpath.relpath(path, source)
        if rel_path == '.':
          rel_path = ''
        for file in files:
          _copy_file(self._join(path, file),
                     self._join(destination, rel_path, file))

    exceptions = {}
    for source, destination in zip(source_file_names, destination_file_names):
      try:
        rel_source = self._parse_url(source)
        rel_destination = self._parse_url(destination)
        _copy_path(rel_source, rel_destination)
      except Exception as e:  # pylint: disable=broad-except
        exceptions[(source, destination)] = e

    if exceptions:
      raise BeamIOError('Copy operation failed', exceptions)

  def rename(self, source_file_names, destination_file_names):
    exceptions = {}
    for source, destination in zip(source_file_names, destination_file_names):
      try:
        rel_source = self._parse_url(source)
        rel_destination = self._parse_url(destination)
        if not self._hdfs_client.mv(rel_source, rel_destination):
          raise BeamIOError(
              'libhdfs error in renaming %s to %s' % (source, destination))
      except Exception as e:  # pylint: disable=broad-except
        exceptions[(source, destination)] = e

    if exceptions:
      raise BeamIOError('Rename operation failed', exceptions)

  def exists(self, url):
    """Checks existence of url in HDFS.

    Args:
      url: String in the form hdfs://...

    Returns:
      True if url exists as a file or directory in HDFS.
    """
    path = self._parse_url(url)
    return self._exists(path)

  def _exists(self, path):
    """Returns True if path exists as a file or directory in HDFS.

    Args:
      path: String in the form /...
    """
    return self._hdfs_client.exists(path)

  def delete(self, urls):
    exceptions = {}
    for url in urls:
      try:
        path = self._parse_url(url)
        self._hdfs_client.rm(path, recursive=True)
      except Exception as e:  # pylint: disable=broad-except
        exceptions[url] = e

    if exceptions:
      raise BeamIOError("Delete operation failed", exceptions)
class HDFSUtil(object):
    """
    Reference: https://hdfs3.readthedocs.io/en/latest/api.html
    HDFS utility handles read, write, delete files
    """

    def __init__(self):
        self.hdfs = HDFileSystem(host='localhost', port=9000)
        self.dest_path_tweet = '/user/BigData/tweet_data'
        self.dest_path_rss = '/user/BigData/rss_data'
        self.dest_path_corona = '/user/BigData/corona_data'
        self.destination_path = ""
        self.import_path = '../import_data'
        self.import_path_tweet = os.path.join(self.import_path, 'tweets')
        self.import_path_rss = os.path.join(self.import_path, 'rss')
        self.import_path_corona = os.path.join(self.import_path, 'corona')
        self.hdfs_types = {'tweet': self.dest_path_tweet,
                           'rss': self.dest_path_rss,
                           'corona': self.dest_path_corona}
        self.import_types = {'tweet': self.import_path_tweet,
                             'rss': self.import_path_rss,
                             'corona': self.import_path_corona}
        self.temp_types = {'tweet': "TempData/temp_tweet.csv",
                           "rss": "TempData/temp_rss.csv",
                           "corona": "TempData/temp_corona.csv"}

    def get_files(self, data_type):
        """
        Return a list of the files contained inside HDFS.
        :param data_type: string, only accepts 'tweet', 'rss', 'corona'
        :return: list of files stored in HDFS
        """
        if "tweet" == str(data_type).lower():
            self.destination_path = self.dest_path_tweet
        elif "rss" == str(data_type).lower():
            self.destination_path = self.dest_path_rss
        elif "corona" == str(data_type).lower():
            self.destination_path = self.dest_path_corona
        else:
            raise Exception("Invalid data type, check if input string is correct.")

        if self.hdfs.exists(self.destination_path):
            return self.hdfs.ls(self.destination_path)

    def import_local_data(self, overwrite=False):
        """
        Import files from the local storage folder "import_data".
        Prints out the files that are being pushed to HDFS.
        :return: None
        """
        for data_type in self.import_types.keys():
            try:
                hdfs_files = self.get_files(data_type)
                local_folder = self.import_types[data_type]
                onlyfiles = [f for f in listdir(local_folder)
                             if isfile(join(local_folder, f)) and ".~" not in f]
                for file in onlyfiles:
                    try:
                        if overwrite:
                            dest_path = self.hdfs_types[data_type]
                            self.hdfs.put(os.path.join(local_folder, file),
                                          os.path.join(dest_path, file))
                            print(f"Write to HDFS: {os.path.join(dest_path, file)}")
                        else:
                            if sum([file in f for f in hdfs_files]) == 0 or len(hdfs_files) == 0:
                                dest_path = self.hdfs_types[data_type]
                                self.hdfs.put(os.path.join(local_folder, file),
                                              os.path.join(dest_path, file))
                                print(f"Write to HDFS: {os.path.join(dest_path, file)}")
                    except Exception as e:
                        print(str(e))
            except Exception as e:
                print(str(e))

    def delete_file(self, file_name, data_type=None):
        """
        Attempt to delete file in HDFS by file name.
        :param file_name: Case sensitive
        :param data_type: Which data type to delete from 'tweet', 'rss', 'corona'
        :return: True if successful else False
        """
        try:
            del_count = 0
            for mdata_type in self.hdfs_types.keys():
                if data_type:
                    if mdata_type == data_type:
                        hdfs_path = self.hdfs_types[mdata_type]
                        hdfs_path = os.path.join(hdfs_path, file_name)
                        if self.hdfs.exists(hdfs_path):
                            self.hdfs.rm(hdfs_path)
                            print(f"File deleted: {file_name}")
                            del_count += 1
                else:
                    hdfs_path = self.hdfs_types[mdata_type]
                    hdfs_path = os.path.join(hdfs_path, file_name)
                    if self.hdfs.exists(hdfs_path):
                        self.hdfs.rm(hdfs_path)
                        print(f"File deleted: {file_name}")
                        del_count += 1

            if del_count == 0:
                print(f"Could not find file in HDFS: {file_name}")
                return False
            else:
                return True
        except Exception as e:
            print(str(e))
            return False

    def is_file_exist(self, file_name):
        try:
            for data_type in self.hdfs_types.keys():
                hdfs_path = self.hdfs_types[data_type]
                hdfs_path = os.path.join(hdfs_path, file_name)
                if self.hdfs.exists(hdfs_path):
                    return True
            return False
        except Exception as e:
            print(str(e))

    def read_file(self, file_name):
        """
        Return the DataFrame loaded from HDFS
        :param file_name: Case sensitive
        :return: DataFrame
        """
        try:
            for data_type in self.hdfs_types.keys():
                hdfs_path = self.hdfs_types[data_type]
                hdfs_path = os.path.join(hdfs_path, file_name)
                if self.hdfs.exists(hdfs_path):
                    with self.hdfs.open(hdfs_path) as f:
                        df = pd.read_csv(f)
                    return df
        except Exception as e:
            print(str(e))

    def read_files(self, file_names):
        raise Exception("Not implemented")

    def write_file(self, source_file, file_name):
        try:
            for data_type in self.import_types.keys():
                if data_type in source_file:
                    dest_path = self.hdfs_types[data_type]
                    # pushing to HDFS
                    self.hdfs.put(source_file, os.path.join(dest_path, file_name))
                    print(f"Write to file: {os.path.join(dest_path, file_name)}")
        except Exception as e:
            print(str(e))
class HDFSUtil(object):
    """
    Reference: https://hdfs3.readthedocs.io/en/latest/api.html
    HDFS utility handles read, write, delete files
    """

    def __init__(self):
        self.hdfs = HDFileSystem(host='localhost', port=9000)
        self.dest_path_tweet = '/user/BigData/tweet_data'
        self.dest_path_rss = '/user/BigData/rss_data'
        self.dest_path_corona = '/user/BigData/corona_data'
        self.destination_path = ""
        self.import_path = '../import_data'
        self.import_path_tweet = os.path.join(self.import_path, 'tweets')
        self.import_path_rss = os.path.join(self.import_path, 'rss')
        self.import_path_corona = os.path.join(self.import_path, 'corona')
        self.hdfs_types = {
            'tweet': self.dest_path_tweet,
            'rss': self.dest_path_rss,
            'corona': self.dest_path_corona
        }
        self.import_types = {
            'tweet': self.import_path_tweet,
            'rss': self.import_path_rss,
            'corona': self.import_path_corona
        }
        self.temp_types = {
            'tweet': "TempData/temp_tweet.csv",
            "rss": "TempData/temp_rss.csv",
            "corona": "TempData/temp_corona.csv"
        }

    def get_files(self, data_type):
        """
        Return a list of the files contained inside HDFS.
        :param data_type: string, only accepts 'tweet', 'rss', 'corona'
        :return: list of files stored in HDFS
        """
        if "tweet" == str(data_type).lower():
            self.destination_path = self.dest_path_tweet
        elif "rss" == str(data_type).lower():
            self.destination_path = self.dest_path_rss
        elif "corona" == str(data_type).lower():
            self.destination_path = self.dest_path_corona
        else:
            raise Exception(
                "Invalid data type, check if input string is correct.")

        if self.hdfs.exists(self.destination_path):
            return self.hdfs.ls(self.destination_path)

    def import_local_data(self, overwrite=False):
        """
        Import files from the local storage folder "import_data".
        Prints out the files that are being pushed to HDFS.
        :return: None
        """
        for data_type in self.import_types.keys():
            try:
                hdfs_files = self.get_files(data_type)
                local_folder = self.import_types[data_type]
                onlyfiles = [
                    f for f in listdir(local_folder)
                    if isfile(os.path.join(local_folder, f)) and ".~" not in f
                ]
                for file in onlyfiles:
                    try:
                        if overwrite:
                            dest_path = self.hdfs_types[data_type]
                            self.hdfs.put(os.path.join(local_folder, file),
                                          os.path.join(dest_path, file))
                            print(f"Write to HDFS: {os.path.join(dest_path, file)}")
                        else:
                            if sum([file in f for f in hdfs_files]) == 0 or len(hdfs_files) == 0:
                                dest_path = self.hdfs_types[data_type]
                                self.hdfs.put(os.path.join(local_folder, file),
                                              os.path.join(dest_path, file))
                                print(f"Write to HDFS: {os.path.join(dest_path, file)}")
                    except Exception as e:
                        print(str(e))
            except Exception as e:
                print(str(e))

    def delete_file(self, file_name, data_type=None):
        """
        Attempt to delete file in HDFS by file name.
        :param file_name: Case sensitive
        :param data_type: Which data type to delete from 'tweet', 'rss', 'corona'
        :return: True if successful else False
        """
        try:
            del_count = 0
            for mdata_type in self.hdfs_types.keys():
                if data_type:
                    if mdata_type == data_type:
                        hdfs_path = self.hdfs_types[mdata_type]
                        hdfs_path = os.path.join(hdfs_path, file_name)
                        if self.hdfs.exists(hdfs_path):
                            self.hdfs.rm(hdfs_path)
                            print(f"File deleted: {file_name}")
                            del_count += 1
                else:
                    hdfs_path = self.hdfs_types[mdata_type]
                    hdfs_path = os.path.join(hdfs_path, file_name)
                    if self.hdfs.exists(hdfs_path):
                        self.hdfs.rm(hdfs_path)
                        print(f"File deleted: {file_name}")
                        del_count += 1

            if del_count == 0:
                print(f"Could not find file in HDFS: {file_name}")
                return False
            else:
                return True
        except Exception as e:
            print(str(e))
            return False

    def delete_all_file(self):
        del_count = 0
        for data_type in self.import_types.keys():
            try:
                hdfs_files = self.get_files(data_type)
                for file in hdfs_files:
                    if self.hdfs.exists(file):
                        self.hdfs.rm(file)
                        print(f"File deleted: {file}")
                        del_count += 1
            except Exception as e:
                print(str(e))
        print(f"Total files deleted: {del_count}")

    def is_file_exist(self, file_name):
        try:
            for data_type in self.hdfs_types.keys():
                hdfs_path = self.hdfs_types[data_type]
                hdfs_path = os.path.join(hdfs_path, file_name)
                if self.hdfs.exists(hdfs_path):
                    return True
            return False
        except Exception as e:
            print(str(e))

    def read_file_date(self, start_date, end_date=None, data_type="tweet"):
        """
        Reading files from specific dates.
        "start_date": Starting date, inclusive; has to be in 31-03-2020 format.
        "end_date": Nullable field. Ending date, inclusive; has to be in
        31-03-2020 format.
        :param start_date: { "start_date": 31-03-2020 }
        :param end_date: { "start_date": 31-03-2020, "end_date": 30-04-2020 }
        :param data_type: "tweet", "rss"
        :return: pandas data frame
        """
        start_date = datetime.strptime(start_date, "%d-%m-%Y")
        if end_date:
            end_date = datetime.strptime(end_date, "%d-%m-%Y")
        else:
            end_date = start_date

        # Getting all files within start and end date
        files = self.get_files(data_type)
        files_datetime = []
        for f in files:
            try:
                f = f.split("/")[-1]
                datetimeStr = f.split("_")[1]
                datetimeStr = datetimeStr.split(".")[0]
                date = datetime.strptime(datetimeStr, "%d-%m-%Y")
                files_datetime.append(date)
            except Exception as e:
                print(str(e))

        # Append files to be loaded
        result_files = []
        for file, date in zip(files, files_datetime):
            if start_date <= date <= end_date:
                result_files.append(file)
        if len(result_files) == 0:
            return None, None

        # Read and combine all dataframes
        max_files = 3
        df_list = []
        for mfile in result_files[0:max_files]:
            if self.hdfs.exists(mfile):
                with self.hdfs.open(mfile) as file:
                    df = pd.read_csv(file)
                    df_list.append(df)
        df = pd.concat(df_list)

        if data_type == "tweet":
            for idx, row in df.iterrows():
                if str(row['extended_tweet']) != "nan":
                    df['text'].iloc[idx] = row['extended_tweet']

        p_schema = self.pandas_to_spark_schema(df)
        return df, p_schema

    def read_file_dataframe(self, file_name):
        """
        Return the DataFrame loaded from HDFS
        :param file_name: Case sensitive
        :return: DataFrame, DataFrame_Schema
        """
        try:
            for data_type in self.hdfs_types.keys():
                hdfs_path = self.hdfs_types[data_type]
                hdfs_path = os.path.join(hdfs_path, file_name)
                if self.hdfs.exists(hdfs_path):
                    with self.hdfs.open(hdfs_path) as f:
                        df = pd.read_csv(f)
                    p_schema = self.pandas_to_spark_schema(df)
                    return df, p_schema
        except Exception as e:
            print(str(e))

    def read_file(self, file_name):
        try:
            for data_type in self.hdfs_types.keys():
                hdfs_path = self.hdfs_types[data_type]
                hdfs_path = os.path.join(hdfs_path, file_name)
                if self.hdfs.exists(hdfs_path):
                    return hdfs_path
        except Exception as e:
            print(str(e))

    # Auxiliary functions
    def equivalent_type(self, f):
        if f == 'datetime64[ns]':
            return DateType()
        elif f == 'int64':
            return LongType()
        elif f == 'int32':
            return IntegerType()
        elif f == 'float64':
            return FloatType()
        else:
            return StringType()

    def define_structure(self, string, format_type):
        try:
            typo = self.equivalent_type(format_type)
        except:
            typo = StringType()
        return StructField(string, typo)

    # Given a pandas dataframe, it will return a Spark schema.
    def pandas_to_spark_schema(self, pandas_df):
        columns = list(pandas_df.columns)
        types = list(pandas_df.dtypes)
        struct_list = []
        for column, typo in zip(columns, types):
            struct_list.append(self.define_structure(column, typo))
        p_schema = StructType(struct_list)
        return p_schema

    def write_file(self, source_file, file_name):
        try:
            for data_type in self.import_types.keys():
                if data_type in source_file:
                    dest_path = self.hdfs_types[data_type]
                    # pushing to HDFS
                    self.hdfs.put(source_file, os.path.join(dest_path, file_name))
                    print(f"Write to file: {os.path.join(dest_path, file_name)}")
        except Exception as e:
            print(str(e))
class HadoopFileSystem(FileSystem):
  """``FileSystem`` implementation that supports HDFS.

  URL arguments to methods expect strings starting with ``hdfs://``.

  Uses client library :class:`hdfs3.core.HDFileSystem`.
  """

  def __init__(self, pipeline_options):
    """Initializes a connection to HDFS.

    Connection configuration is done using :doc:`hdfs`.
    """
    super(HadoopFileSystem, self).__init__(pipeline_options)
    self._hdfs_client = HDFileSystem()

  @classmethod
  def scheme(cls):
    return 'hdfs'

  @staticmethod
  def _parse_url(url):
    """Verifies that url begins with hdfs:// prefix, strips it and adds a
    leading /.

    Raises:
      ValueError if url doesn't begin with hdfs://.

    Args:
      url: A URL in the form hdfs://path/...

    Returns:
      For an input of 'hdfs://path/...', will return '/path/...'.
    """
    m = _URL_RE.match(url)
    if m is None:
      raise ValueError('Could not parse url: %s' % url)
    return m.group(1)

  def join(self, base_url, *paths):
    """Join two or more pathname components.

    Args:
      base_url: string path of the first component of the path.
        Must start with hdfs://.
      paths: path components to be added

    Returns:
      Full url after combining all the passed components.
    """
    basepath = self._parse_url(base_url)
    return _HDFS_PREFIX + self._join(basepath, *paths)

  def _join(self, basepath, *paths):
    return posixpath.join(basepath, *paths)

  def split(self, url):
    rel_path = self._parse_url(url)
    head, tail = posixpath.split(rel_path)
    return _HDFS_PREFIX + head, tail

  def mkdirs(self, url):
    path = self._parse_url(url)
    if self._exists(path):
      raise IOError('Path already exists: %s' % path)
    return self._mkdirs(path)

  def _mkdirs(self, path):
    self._hdfs_client.makedirs(path)

  def match(self, url_patterns, limits=None):
    if limits is None:
      limits = [None] * len(url_patterns)
    if len(url_patterns) != len(limits):
      raise BeamIOError(
          'Patterns and limits should be equal in length: %d != %d' %
          (len(url_patterns), len(limits)))

    # TODO(udim): Update client to allow batched results.
    def _match(path_pattern, limit):
      """Find all matching paths to the pattern provided."""
      file_infos = self._hdfs_client.ls(path_pattern, detail=True)[:limit]
      metadata_list = [
          FileMetadata(file_info['name'], file_info['size'])
          for file_info in file_infos
      ]
      return MatchResult(path_pattern, metadata_list)

    exceptions = {}
    result = []
    for url_pattern, limit in zip(url_patterns, limits):
      try:
        path_pattern = self._parse_url(url_pattern)
        result.append(_match(path_pattern, limit))
      except Exception as e:  # pylint: disable=broad-except
        exceptions[url_pattern] = e

    if exceptions:
      raise BeamIOError('Match operation failed', exceptions)
    return result

  def _open_hdfs(self, path, mode, mime_type, compression_type):
    if mime_type != 'application/octet-stream':
      logging.warning(
          'Mime types are not supported. Got non-default mime_type:'
          ' %s', mime_type)
    if compression_type == CompressionTypes.AUTO:
      compression_type = CompressionTypes.detect_compression_type(path)
    res = self._hdfs_client.open(path, mode)
    if compression_type != CompressionTypes.UNCOMPRESSED:
      res = CompressedFile(res)
    return res

  def create(self, url, mime_type='application/octet-stream',
             compression_type=CompressionTypes.AUTO):
    """
    Returns:
      *hdfs3.core.HDFile*: A Python File-like object.
    """
    path = self._parse_url(url)
    return self._create(path, mime_type, compression_type)

  def _create(self, path, mime_type='application/octet-stream',
              compression_type=CompressionTypes.AUTO):
    return self._open_hdfs(path, 'wb', mime_type, compression_type)

  def open(self, url, mime_type='application/octet-stream',
           compression_type=CompressionTypes.AUTO):
    """
    Returns:
      *hdfs3.core.HDFile*: A Python File-like object.
    """
    path = self._parse_url(url)
    return self._open(path, mime_type, compression_type)

  def _open(self, path, mime_type='application/octet-stream',
            compression_type=CompressionTypes.AUTO):
    return self._open_hdfs(path, 'rb', mime_type, compression_type)

  def copy(self, source_file_names, destination_file_names):
    """
    Will overwrite files and directories in destination_file_names.

    Raises ``BeamIOError`` if any error occurred.

    Args:
      source_file_names: iterable of URLs.
      destination_file_names: iterable of URLs.
    """
    if len(source_file_names) != len(destination_file_names):
      raise BeamIOError(
          'source_file_names and destination_file_names should '
          'be equal in length: %d != %d' %
          (len(source_file_names), len(destination_file_names)))

    def _copy_file(source, destination):
      with self._open(source) as f1:
        with self._create(destination) as f2:
          while True:
            buf = f1.read(_COPY_BUFFER_SIZE)
            if not buf:
              break
            f2.write(buf)

    def _copy_path(source, destination):
      """Recursively copy the file tree from the source to the destination."""
      if not self._hdfs_client.isdir(source):
        _copy_file(source, destination)
        return

      for path, dirs, files in self._hdfs_client.walk(source):
        for dir in dirs:
          new_dir = self._join(destination, dir)
          if not self._exists(new_dir):
            self._mkdirs(new_dir)

        rel_path = posixpath.relpath(path, source)
        if rel_path == '.':
          rel_path = ''
        for file in files:
          _copy_file(self._join(path, file),
                     self._join(destination, rel_path, file))

    exceptions = {}
    for source, destination in zip(source_file_names, destination_file_names):
      try:
        rel_source = self._parse_url(source)
        rel_destination = self._parse_url(destination)
        _copy_path(rel_source, rel_destination)
      except Exception as e:  # pylint: disable=broad-except
        exceptions[(source, destination)] = e

    if exceptions:
      raise BeamIOError('Copy operation failed', exceptions)

  def rename(self, source_file_names, destination_file_names):
    exceptions = {}
    for source, destination in zip(source_file_names, destination_file_names):
      try:
        rel_source = self._parse_url(source)
        rel_destination = self._parse_url(destination)
        if not self._hdfs_client.mv(rel_source, rel_destination):
          raise BeamIOError('libhdfs error in renaming %s to %s' %
                            (source, destination))
      except Exception as e:  # pylint: disable=broad-except
        exceptions[(source, destination)] = e

    if exceptions:
      raise BeamIOError('Rename operation failed', exceptions)

  def exists(self, url):
    """Checks existence of url in HDFS.

    Args:
      url: String in the form hdfs://...

    Returns:
      True if url exists as a file or directory in HDFS.
    """
    path = self._parse_url(url)
    return self._exists(path)

  def _exists(self, path):
    """Returns True if path exists as a file or directory in HDFS.

    Args:
      path: String in the form /...
    """
    return self._hdfs_client.exists(path)

  def delete(self, urls):
    exceptions = {}
    for url in urls:
      try:
        path = self._parse_url(url)
        self._hdfs_client.rm(path, recursive=True)
      except Exception as e:  # pylint: disable=broad-except
        exceptions[url] = e

    if exceptions:
      raise BeamIOError("Delete operation failed", exceptions)
        return open(path, mode)

    def put(self, src, dst):
        return shutil.copy(src, dst)


if __name__ == "__main__":
    # load the hdfs node info
    f = open('hdfs.yml', 'r')
    data = yaml.load(f)
    f.close()
    hdfs_nn = data['hdfs_nn']

    hdfs = HDFileSystem(host=hdfs_nn, port=data['hdfs_port'])
    tfs = TransparentFileSystem(hdfs)
    print hdfs.exists('/tmp')
    # print hdfs.hoge('/tmp')
    print tfs.exists('/tmp')
    # print tfs.hoge('/tmp')

    # tfs_local = TransparentFileSystem()
    # print tfs_local.glob('/var/tmp')
    print 'test'
    print tfs.glob('/tmp')
    # tfs.hoge()

    tfs_local = TransparentFileSystem()
    # print tfs_local.glob('/home/vagrant/work/data/*')
    # tfs_local.hoge()
    # print tfs.hoge()
class HdfsUtils(object):
    def __init__(self, host, port, user):
        self.hdfs = HDFileSystem(host=host, port=port, user=user)

    def hdfs_download(self, hdfs_path, local_path):
        '''
        Download file or dir from hdfs
        :param hdfs_path:
        :param local_path:
        :return:
        '''
        hdfs_path = os.path.normpath(hdfs_path)
        local_path = os.path.normpath(local_path)
        local_parent_path = os.path.dirname(local_path)

        if not self.hdfs.exists(hdfs_path):
            raise Exception('hdfs file not exists: ' + hdfs_path)
        if local_parent_path.strip() and not os.path.exists(local_parent_path):
            raise Exception('local parent folder not exists: ' + local_parent_path)
        if os.path.exists(local_path):
            raise Exception('local file exists: ' + local_path)

        if self.hdfs.isfile(hdfs_path):
            print('is file')
            self.hdfs.get(hdfs_path, local_path)
        elif self.hdfs.isdir(hdfs_path):
            print('is dir')
            os.mkdir(local_path)
            for (root, dirnames, filenames) in self.hdfs.walk(hdfs_path):
                relative_path = os.path.relpath(root, hdfs_path)
                for dirname in dirnames:
                    current_local_dir_path = os.path.join(
                        local_path, relative_path, dirname)
                    os.makedirs(current_local_dir_path)
                for filename in filenames:
                    current_hdfs_file_path = os.path.join(root, filename)
                    current_local_file_path = os.path.join(
                        local_path, relative_path, filename)
                    self.hdfs.get(current_hdfs_file_path, current_local_file_path)
        else:
            raise Exception('parameters invalid')
        print('Done.')

    def hdfs_upload(self, local_path, hdfs_path):
        '''
        Upload file or dir to hdfs
        :param local_path:
        :param hdfs_path:
        :return:
        '''
        local_path = os.path.normpath(local_path)
        hdfs_path = os.path.normpath(hdfs_path)
        hdfs_parent_path = os.path.dirname(hdfs_path)

        if not os.path.exists(local_path):
            raise Exception('local file not exists: ' + local_path)
        if hdfs_parent_path.strip() and not self.hdfs.exists(hdfs_parent_path):
            raise Exception('hdfs parent folder not exists: ' + hdfs_parent_path)
        if self.hdfs.exists(hdfs_path):
            raise Exception('hdfs file exists: ' + hdfs_path)

        if os.path.isfile(local_path):
            print('is file')
            self.hdfs.put(local_path, hdfs_path)
        elif os.path.isdir(local_path):
            print('is dir')
            self.hdfs.mkdir(hdfs_path)
            for (root, dirnames, filenames) in os.walk(local_path):
                relative_path = os.path.relpath(root, local_path)
                for dirname in dirnames:
                    current_hdfs_dir_path = os.path.join(
                        hdfs_path, relative_path, dirname)
                    self.hdfs.mkdir(current_hdfs_dir_path)
                for filename in filenames:
                    if filename != '.gitignore':
                        current_local_file_path = os.path.join(root, filename)
                        current_hdfs_file_path = os.path.join(
                            hdfs_path, relative_path, filename)
                        self.hdfs.put(current_local_file_path, current_hdfs_file_path)
        else:
            raise Exception('parameters invalid')
        print('Done.')

    def hdfs_delete(self, hdfs_path):
        '''
        Delete file or dir at hdfs
        :param hdfs_path:
        :param local_path:
        :return:
        '''
        hdfs_path = os.path.normpath(hdfs_path)
        if self.hdfs.exists(hdfs_path):
            self.hdfs.rm(hdfs_path)
        print('Done.')

    def hdfs_mv(self, source_hdfs_path, target_hdfs_path):
        self.hdfs.mv(source_hdfs_path, target_hdfs_path)