def getAllFolderHaveData(client_: HdfsClient, path_: str):
    _folderPathList = []
    for _root, _dir, _files in client_.walk(path_, status=True):
        # Only folders that actually contain files need to be copied
        if len(_files) > 0:
            print(_root)
            _folderPathList.append(_root)
    return utils.listUtils.joinListToStr(_folderPathList, "\n")
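# --- Usage sketch (illustrative; not part of the original snippet) ---
# Assumes `client` is an already-constructed HdfsClient from whichever HDFS
# client library the project uses (its walk() must accept status=True), and
# that "/user/data" is a placeholder path.
folders = getAllFolderHaveData(client, "/user/data")
print(folders)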
class DeleteHdfsData():
    # Initialization
    def __init__(self):
        self.host = "172.27.133.18"
        self.port = "8020"
        self.userName = "******"
        self.remotePath = "/user/shiyouguandao"
        self.fs = HdfsClient(self.host, self.userName)
        self.ReadHdfsFile()

    # Read files and log how many were deleted
    def ReadHdfsFile(self):
        count = self.HdfsFileList(self.remotePath)
        localtime = time.asctime(time.localtime(time.time()))
        log.info(localtime + "\tDeleted " + str(count) +
                 " csv files under /user/shiyouguandao in total...")

    # Delete a single file
    def DeleteHdfsFile(self, hdfsPath):
        self.fs.delete(hdfsPath, skip_trash=False)
        # self.fs.delete("/user/shiyouguandao/feature_ZSY-69_2019-09-24_23411.csv", skip_trash=False)

    # Walk the directory tree and decide which files to delete
    def HdfsFileList(self, path):
        count = 0
        for root, dirs, files in self.fs.walk(path):
            for file in files:
                is_csv = self.hdfsFileHandler(file)
                if is_csv:
                    self.DeleteHdfsFile(path + "/" + file)
                    count += 1
        return count

    # File filter: keep only .csv files older than one day
    def hdfsFileHandler(self, fileName):
        if fileName.endswith(".csv"):
            temp = 60 * 60 * 24  # one day in seconds
            index = fileName.rfind("_")
            # Slice out the 10-character date (YYYY-MM-DD) preceding the last underscore
            date_str = fileName[index - len(fileName) - 10:index - len(fileName)]
            current = int(time.time())
            fileTime = int(time.mktime(time.strptime(date_str, "%Y-%m-%d")))
            if (current - fileTime) >= temp:
                return True
            else:
                return False
        return False
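# --- Worked example of the .csv age filter above (standard library only) ---
# The sample name mirrors the hard-coded path referenced in DeleteHdfsFile:
# <prefix>_<YYYY-MM-DD>_<suffix>.csv
import time

fileName = "feature_ZSY-69_2019-09-24_23411.csv"
index = fileName.rfind("_")                        # underscore before "23411"
date_str = fileName[index - len(fileName) - 10:index - len(fileName)]
print(date_str)                                    # -> "2019-09-24"
fileTime = int(time.mktime(time.strptime(date_str, "%Y-%m-%d")))
one_day = 60 * 60 * 24
print(int(time.time()) - fileTime >= one_day)      # True once the file is a day old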
class HDBDataStore(object):
    """
    Singleton class to read and maintain datasets for the Service API.
    It is not a generic HBase dataset handler.
    """
    __metaclass__ = Singleton

    def __init__(self, hdfs_host, hbase_host, hbase_port_no, table_name,
                 repo_path):
        logging.info('Open connection pool for hbase host:%s port:%d',
                     hbase_host, hbase_port_no)
        # create connection pools
        try:
            self.conn_pool = happybase.ConnectionPool(
                DB_CONNECTION_POOL_SIZE,
                host=hbase_host,
                port=hbase_port_no,
                timeout=DB_CONNECTION_TIME_OUT)
        except TException as exception:
            logging.warn(
                "Exception thrown for HBase connection pool creation {%s}",
                exception.message)
        self.hbase_host = hbase_host
        self.hdfs_host = hdfs_host
        self.hbase_port_no = hbase_port_no
        self.table_name = table_name
        self.repo_path = repo_path
        self.master_dataset = list()
        self.client = HdfsClient(hosts=hdfs_host, user_name='hdfs')

    def collect(self):
        """
        Collect datasets by reading from the HDFS repo and the HBase repo.
        :return:
        """
        hdfs_list = self.read_data_from_repo()
        hbase_list = self.retrieve_datasets_from_hbase()
        inter_list = list()
        # find intersection and keep the hbase copy
        for hbase_entry, hdfs_entry in [(hbase_entry, hdfs_entry)
                                        for hbase_entry in hbase_list
                                        for hdfs_entry in hdfs_list]:
            if hbase_entry['id'] == hdfs_entry['id']:
                # remove entries in the HDFS list that match hbase
                inter_list.append(hbase_entry)
                hdfs_list.remove(hdfs_entry)
                hbase_list.remove(hbase_entry)
        # yes intersection
        if len(inter_list) > 0:
            logging.debug("The intersection list is:%s", inter_list)
            self.master_dataset = inter_list + hdfs_list
            if len(hbase_list) != 0:
                logging.warn("Warning: untracked datasets of size %d",
                             len(hbase_list))
                self.master_dataset = self.master_dataset + tag_for_integrity(
                    hbase_list)
        else:
            # god knows what's happening
            self.master_dataset = tag_for_integrity(hbase_list) + hdfs_list

    def read_data_from_repo(self):
        """
        Read data from the HDFS repo_path.
        :return:
        """
        repo_path = self.repo_path
        hdfs_dataset = list()
        try:
            for root, dirs, _ in self.client.walk(repo_path,
                                                  topdown=True,
                                                  onerror=onerror):
                for entry in dirs:
                    m_source = re.match('^source=(?P<source>.*)', entry)
                    if m_source is None:
                        continue
                    elif m_source.group('source') == '':
                        logging.warn(
                            'An empty source is present, this is not allowed. '
                            'Something was wrong during ingestion')
                        continue
                    else:
                        item = {
                            DATASET.ID: m_source.group('source'),
                            DATASET.POLICY: POLICY.SIZE,
                            DATASET.PATH: os.path.join(root, entry),
                            DATASET.MODE: 'keep'
                        }
                        hdfs_dataset.append(item)
                # only the top level of repo_path is inspected
                break
        except HdfsException as exception:
            logging.warn("Error in walking HDFS file system %s",
                         exception.message)
        return hdfs_dataset

    def retrieve_datasets_from_hbase(self):
        """
        Connect to the hbase table and return the list of hbase_dataset entries.
        :return:
        """
        hbase_datasets = list()
        table_name = self.table_name
        try:
            with self.conn_pool.connection(
                    DB_CONNECTION_TIME_OUT) as connection:
                if table_name not in connection.tables():
                    logging.info('creating hbase table %s', table_name)
                    connection.create_table(table_name, {'cf': dict()})
                table = connection.table(table_name)
                for _, data in table.scan(limit=1):
                    logging.debug('%s found', table_name)
        except TException as exception:
            logging.warn("failed to read table from hbase error(%s):",
                         exception.message)
            return hbase_datasets
        logging.debug('connecting to hbase to read hbase_dataset')
        for key, data in table.scan():
            item = {
                DATASET.ID: key,
                DATASET.PATH: data[DBSCHEMA.PATH],
                DATASET.POLICY: data[DBSCHEMA.POLICY],
                DATASET.MODE: data[DBSCHEMA.MODE]
            }
            if item[DATASET.POLICY] == POLICY.AGE:
                item[DATASET.MAX_AGE] = int(data[DBSCHEMA.RETENTION])
            elif item[DATASET.POLICY] == POLICY.SIZE:
                item[DATASET.MAX_SIZE] = int(data[DBSCHEMA.RETENTION])
            hbase_datasets.append(item)
        logging.info(hbase_datasets)
        return hbase_datasets

    def read_datasets(self):
        """
        Return the list of datasets collected from HDFS and HBase.
        :return:
        """
        return self.master_dataset

    def read_partitions(self, data_path):
        """
        Read partitions for an HDFS dataset.
        :param data_path:
        :return:
        """
        data_parts = list()
        try:
            for entry in dirwalk(self.client, data_path):
                if entry not in data_parts:
                    data_parts.append(entry)
        except HdfsException as exception:
            logging.warn(
                "Error in walking HDFS file system for partitions errormsg:%s",
                exception.message)
        return data_parts

    def write_dataset(self, data):
        """
        Persist a dataset entry into the HBase table.
        :param data: dataset that needs an update
        :return: None
        """
        try:
            logging.debug("Write dataset:{%s}", data)
            table_name = self.table_name
            with self.conn_pool.connection(
                    DB_CONNECTION_TIME_OUT) as connection:
                table = connection.table(table_name)
                dataset = {
                    DBSCHEMA.PATH: data[DATASET.PATH],
                    DBSCHEMA.POLICY: data[DATASET.POLICY],
                    DBSCHEMA.MODE: data[DATASET.MODE]
                }
                if DATASET.RETENTION in data:
                    dataset[DBSCHEMA.RETENTION] = data[DATASET.RETENTION]
                logging.debug("calling put on table for %s", dataset)
                table.put(data[DATASET.ID], dataset)
        except TException as exception:
            logging.warn("Failed to write dataset into hbase, error(%s):",
                         exception.message)

    def delete_dataset(self, data):
        """
        Delete a dataset entry from HBase.
        :param data: dataset instance
        :return: None
        """
        try:
            table_name = self.table_name
            with self.conn_pool.connection(
                    DB_CONNECTION_TIME_OUT) as connection:
                table = connection.table(table_name)
                logging.debug("Deleting dataset from HBase:{%s}", data)
                table.delete(data['id'])
        except TException as exception:
            logging.warn("Failed to delete dataset in hbase, error(%s):",
                         exception.message)
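# --- Usage sketch (illustrative; not part of the original snippet) ---
# Host names, the Thrift port and the table/repo names below are placeholders;
# the class and its module-level constants (DB_CONNECTION_POOL_SIZE, DATASET,
# DBSCHEMA, ...) are assumed to be importable from the snippet's own module.
store = HDBDataStore(hdfs_host="namenode.example.com",
                     hbase_host="hbase.example.com",
                     hbase_port_no=9090,
                     table_name="datasets",
                     repo_path="/data/repo")
store.collect()                        # merge HDFS repo entries with HBase rows
for dataset in store.read_datasets():
    print(dataset)                     # each item carries id, path, policy, mode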
def showAllDirAndFiles(client_: HdfsClient, path_: str):
    for _root, _dir, _files in client_.walk(path_, status=True):
        print('_root = ' + str(_root))
        print('_dir = ' + str(_dir))
        print('_files = ' + str(_files))