def getAllFolderHaveData(client_: HdfsClient, path_: str):
    _folderPathList = []
    for _root, _dir, _files in client_.walk(path_, status=True):
        # Only folders that actually contain files need to be copied
        if len(_files) > 0:
            print(_root)
            _folderPathList.append(_root)
    return utils.listUtils.joinListToStr(_folderPathList, "\n")
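# --- Usage sketch (illustrative; not part of the original snippet) ---
# Assumes `client` is an already-constructed HdfsClient from whichever HDFS
# client library the project uses (its walk() must accept status=True), and
# that "/user/data" is a placeholder path.
folders = getAllFolderHaveData(client, "/user/data")
print(folders)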
class DeleteHdfsData():
    # Initialization
    def __init__(self):
        self.host = "172.27.133.18"
        self.port = "8020"
        self.userName = "******"
        self.remotePath = "/user/shiyouguandao"
        self.fs = HdfsClient(self.host, self.userName)
        self.ReadHdfsFile()

    # Read files and log how many were deleted
    def ReadHdfsFile(self):
        count = self.HdfsFileList(self.remotePath)
        localtime = time.asctime(time.localtime(time.time()))
        log.info(localtime + "\tDeleted " + str(count) +
                 " csv files under /user/shiyouguandao in total...")

    # Delete a single file
    def DeleteHdfsFile(self, hdfsPath):
        self.fs.delete(hdfsPath, skip_trash=False)
        # self.fs.delete("/user/shiyouguandao/feature_ZSY-69_2019-09-24_23411.csv", skip_trash=False)

    # Walk the directory tree and decide which files to delete
    def HdfsFileList(self, path):
        count = 0
        for root, dirs, files in self.fs.walk(path):
            for file in files:
                is_csv = self.hdfsFileHandler(file)
                if is_csv:
                    self.DeleteHdfsFile(path + "/" + file)
                    count += 1
        return count

    # File filter: keep only .csv files older than one day
    def hdfsFileHandler(self, fileName):
        if fileName.endswith(".csv"):
            temp = 60 * 60 * 24  # one day in seconds
            index = fileName.rfind("_")
            # Slice out the 10-character date (YYYY-MM-DD) preceding the last underscore
            date_str = fileName[index - len(fileName) - 10:index - len(fileName)]
            current = int(time.time())
            fileTime = int(time.mktime(time.strptime(date_str, "%Y-%m-%d")))
            if (current - fileTime) >= temp:
                return True
            else:
                return False
        return False
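# --- Worked example of the .csv age filter above (standard library only) ---
# The sample name mirrors the hard-coded path referenced in DeleteHdfsFile:
# <prefix>_<YYYY-MM-DD>_<suffix>.csv
import time

fileName = "feature_ZSY-69_2019-09-24_23411.csv"
index = fileName.rfind("_")                        # underscore before "23411"
date_str = fileName[index - len(fileName) - 10:index - len(fileName)]
print(date_str)                                    # -> "2019-09-24"
fileTime = int(time.mktime(time.strptime(date_str, "%Y-%m-%d")))
one_day = 60 * 60 * 24
print(int(time.time()) - fileTime >= one_day)      # True once the file is a day old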
class HDBDataStore(object):
    """
    Singleton class to read and maintain datasets for the Service API.
    It is not a generic HBase dataset handler.
    """
    __metaclass__ = Singleton

    def __init__(self, hdfs_host, hbase_host, hbase_port_no, table_name,
                 repo_path):
        logging.info('Open connection pool for hbase host:%s port:%d',
                     hbase_host, hbase_port_no)
        # create connection pools
        try:
            self.conn_pool = happybase.ConnectionPool(
                DB_CONNECTION_POOL_SIZE,
                host=hbase_host,
                port=hbase_port_no,
                timeout=DB_CONNECTION_TIME_OUT)
        except TException as exception:
            logging.warn(
                "Exception thrown for HBase connection pool creation {%s}",
                exception.message)
        self.hbase_host = hbase_host
        self.hdfs_host = hdfs_host
        self.hbase_port_no = hbase_port_no
        self.table_name = table_name
        self.repo_path = repo_path
        self.master_dataset = list()
        self.client = HdfsClient(hosts=hdfs_host, user_name='hdfs')

    def collect(self):
        """
        Collect datasets by reading from the HDFS repo and the HBase repo.
        :return:
        """
        hdfs_list = self.read_data_from_repo()
        hbase_list = self.retrieve_datasets_from_hbase()
        inter_list = list()
        # find intersection and keep the hbase copy
        for hbase_entry, hdfs_entry in [(hbase_entry, hdfs_entry)
                                        for hbase_entry in hbase_list
                                        for hdfs_entry in hdfs_list]:
            if hbase_entry['id'] == hdfs_entry['id']:
                # remove entries in the HDFS list that match hbase
                inter_list.append(hbase_entry)
                hdfs_list.remove(hdfs_entry)
                hbase_list.remove(hbase_entry)
        # yes intersection
        if len(inter_list) > 0:
            logging.debug("The intersection list is:%s", inter_list)
            self.master_dataset = inter_list + hdfs_list
            if len(hbase_list) != 0:
                logging.warn("Warning: untracked datasets of size %d",
                             len(hbase_list))
                self.master_dataset = self.master_dataset + tag_for_integrity(
                    hbase_list)
        else:
            # god knows what's happening
            self.master_dataset = tag_for_integrity(hbase_list) + hdfs_list

    def read_data_from_repo(self):
        """
        Read data from the HDFS repo_path.
        :return:
        """
        repo_path = self.repo_path
        hdfs_dataset = list()
        try:
            for root, dirs, _ in self.client.walk(repo_path,
                                                  topdown=True,
                                                  onerror=onerror):
                for entry in dirs:
                    m_source = re.match('^source=(?P<source>.*)', entry)
                    if m_source is None:
                        continue
                    elif m_source.group('source') == '':
                        logging.warn(
                            'An empty source is present, this is not allowed. '
                            'Something was wrong during ingestion')
                        continue
                    else:
                        item = {
                            DATASET.ID: m_source.group('source'),
                            DATASET.POLICY: POLICY.SIZE,
                            DATASET.PATH: os.path.join(root, entry),
                            DATASET.MODE: 'keep'
                        }
                        hdfs_dataset.append(item)
                # only the top level of repo_path is inspected
                break
        except HdfsException as exception:
            logging.warn("Error in walking HDFS file system %s",
                         exception.message)
        return hdfs_dataset

    def retrieve_datasets_from_hbase(self):
        """
        Connect to the hbase table and return the list of hbase_dataset entries.
        :return:
        """
        hbase_datasets = list()
        table_name = self.table_name
        try:
            with self.conn_pool.connection(
                    DB_CONNECTION_TIME_OUT) as connection:
                if table_name not in connection.tables():
                    logging.info('creating hbase table %s', table_name)
                    connection.create_table(table_name, {'cf': dict()})
                table = connection.table(table_name)
                for _, data in table.scan(limit=1):
                    logging.debug('%s found', table_name)
        except TException as exception:
            logging.warn("failed to read table from hbase error(%s):",
                         exception.message)
            return hbase_datasets
        logging.debug('connecting to hbase to read hbase_dataset')
        for key, data in table.scan():
            item = {
                DATASET.ID: key,
                DATASET.PATH: data[DBSCHEMA.PATH],
                DATASET.POLICY: data[DBSCHEMA.POLICY],
                DATASET.MODE: data[DBSCHEMA.MODE]
            }
            if item[DATASET.POLICY] == POLICY.AGE:
                item[DATASET.MAX_AGE] = int(data[DBSCHEMA.RETENTION])
            elif item[DATASET.POLICY] == POLICY.SIZE:
                item[DATASET.MAX_SIZE] = int(data[DBSCHEMA.RETENTION])
            hbase_datasets.append(item)
        logging.info(hbase_datasets)
        return hbase_datasets

    def read_datasets(self):
        """
        Return the list of datasets collected from HDFS and HBase.
        :return:
        """
        return self.master_dataset

    def read_partitions(self, data_path):
        """
        Read partitions for an HDFS dataset.
        :param data_path:
        :return:
        """
        data_parts = list()
        try:
            for entry in dirwalk(self.client, data_path):
                if entry not in data_parts:
                    data_parts.append(entry)
        except HdfsException as exception:
            logging.warn(
                "Error in walking HDFS file system for partitions errormsg:%s",
                exception.message)
        return data_parts

    def write_dataset(self, data):
        """
        Persist a dataset entry into the HBase table.
        :param data: dataset that needs an update
        :return: None
        """
        try:
            logging.debug("Write dataset:{%s}", data)
            table_name = self.table_name
            with self.conn_pool.connection(
                    DB_CONNECTION_TIME_OUT) as connection:
                table = connection.table(table_name)
                dataset = {
                    DBSCHEMA.PATH: data[DATASET.PATH],
                    DBSCHEMA.POLICY: data[DATASET.POLICY],
                    DBSCHEMA.MODE: data[DATASET.MODE]
                }
                if DATASET.RETENTION in data:
                    dataset[DBSCHEMA.RETENTION] = data[DATASET.RETENTION]
                logging.debug("calling put on table for %s", dataset)
                table.put(data[DATASET.ID], dataset)
        except TException as exception:
            logging.warn("Failed to write dataset into hbase, error(%s):",
                         exception.message)

    def delete_dataset(self, data):
        """
        Delete a dataset entry from HBase.
        :param data: dataset instance
        :return: None
        """
        try:
            table_name = self.table_name
            with self.conn_pool.connection(
                    DB_CONNECTION_TIME_OUT) as connection:
                table = connection.table(table_name)
                logging.debug("Deleting dataset from HBase:{%s}", data)
                table.delete(data['id'])
        except TException as exception:
            logging.warn("Failed to delete dataset in hbase, error(%s):",
                         exception.message)
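# --- Usage sketch (illustrative; not part of the original snippet) ---
# Host names, the Thrift port and the table/repo names below are placeholders;
# the class and its module-level constants (DB_CONNECTION_POOL_SIZE, DATASET,
# DBSCHEMA, ...) are assumed to be importable from the snippet's own module.
store = HDBDataStore(hdfs_host="namenode.example.com",
                     hbase_host="hbase.example.com",
                     hbase_port_no=9090,
                     table_name="datasets",
                     repo_path="/data/repo")
store.collect()                        # merge HDFS repo entries with HBase rows
for dataset in store.read_datasets():
    print(dataset)                     # each item carries id, path, policy, mode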
def showAllDirAndFiles(client_: HdfsClient, path_: str):
    for _root, _dir, _files in client_.walk(path_, status=True):
        print('_root = ' + str(_root))
        print('_dir = ' + str(_dir))
        print('_files = ' + str(_files))