Example #1
def getAllFolderHaveData(client_: HdfsClient, path_: str):
    _folderPathList = []
    for _root, _dir, _files in client_.walk(path_, status=True):
        # Only folders that actually contain files need to be copied
        if len(_files) > 0:
            print(_root)
            _folderPathList.append(_root)
    return utils.listUtils.joinListToStr(_folderPathList, "\n")
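A minimal usage sketch for the helper above. The host, port, and path below are placeholders, the client is assumed to be the pyhdfs HdfsClient used in Example #3, and utils.listUtils.joinListToStr is a project-specific helper that behaves like "\n".join:

# Hypothetical call of getAllFolderHaveData; connection details are placeholders.
from pyhdfs import HdfsClient

client = HdfsClient(hosts="namenode:50070", user_name="hdfs")
folders_with_files = getAllFolderHaveData(client, "/data/incoming")
print(folders_with_files)  # newline-separated paths of folders that contain files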
Example #2
class DeleteHdfsData():

    # Initialization
    def __init__(self):
        self.host = "172.27.133.18"
        self.port = "8020"
        self.userName = "******"
        self.remotePath = "/user/shiyouguandao"
        self.fs = HdfsClient(hosts=self.host, user_name=self.userName)
        self.ReadHdfsFile()

    # Read files and log how many were deleted
    def ReadHdfsFile(self):
        count = self.HdfsFileList(self.remotePath)
        localtime = time.asctime(time.localtime(time.time()))
        log.info(localtime + "\tdeleted " + str(count) +
                 " csv files under /user/shiyouguandao ...")

    # Delete a single file
    def DeleteHdfsFile(self, hdfsPath):
        # e.g. hdfsPath = "/user/shiyouguandao/feature_ZSY-69_2019-09-24_23411.csv"
        self.fs.delete(hdfsPath, skip_trash=False)

    # Walk the directory listing, delete matching files, and return the count
    def HdfsFileList(self, path):
        count = 0
        for root, dirs, files in self.fs.walk(path):
            for file in files:
                is_csv = self.hdfsFileHandler(file)
                if is_csv:
                    self.DeleteHdfsFile(root + "/" + file)
                    count += 1
        return count

    # Filter: keep only .csv files whose embedded date is at least one day old
    def hdfsFileHandler(self, fileName):
        if fileName.endswith(".csv"):
            oneDay = 60 * 60 * 24
            index = fileName.rfind("_")
            # the 10 characters before the last underscore hold the date, e.g. "2019-09-24"
            dateStr = fileName[index - 10:index]
            current = int(time.time())
            fileTime = int(time.mktime(time.strptime(dateStr, "%Y-%m-%d")))
            return (current - fileTime) >= oneDay
        return False
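To make the retention check in hdfsFileHandler concrete, here is a small standalone sketch of the same slicing and time arithmetic; the file name is modelled on the hard-coded path in DeleteHdfsFile:

import time

file_name = "feature_ZSY-69_2019-09-24_23411.csv"
index = file_name.rfind("_")              # last underscore, just before "23411"
date_part = file_name[index - 10:index]   # the ten characters before it: "2019-09-24"
file_time = int(time.mktime(time.strptime(date_part, "%Y-%m-%d")))
older_than_one_day = (int(time.time()) - file_time) >= 60 * 60 * 24
print(date_part, older_than_one_day)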
Example #3
class HDBDataStore(object):
    """
    Singleton class to read and maintain datasets for the Service API.
    It's not a generic HBase dataset handler.
    """
    __metaclass__ = Singleton

    def __init__(self, hdfs_host, hbase_host, hbase_port_no, table_name,
                 repo_path):
        logging.info('Open connection pool for hbase host:%s port:%d',
                     hbase_host, hbase_port_no)
        # create connection pools
        try:
            self.conn_pool = happybase.ConnectionPool(
                DB_CONNECTION_POOL_SIZE,
                host=hbase_host,
                port=hbase_port_no,
                timeout=DB_CONNECTION_TIME_OUT)
        except TException as exception:
            logging.warn(
                "Exception thrown for HBase connection pool creation {%s}",
                exception.message)
        self.hbase_host = hbase_host
        self.hdfs_host = hdfs_host
        self.hbase_port_no = hbase_port_no
        self.table_name = table_name
        self.repo_path = repo_path
        self.master_dataset = list()
        self.client = HdfsClient(hosts=hdfs_host, user_name='hdfs')

    def collect(self):
        """
        Collect datasets by reading from HDFS Repo and HBase repo
        :return:
        """
        hdfs_list = self.read_data_from_repo()
        hbase_list = self.retrieve_datasets_from_hbase()
        inter_list = list()
        # find intersection and keep hbase copy
        for hbase_entry, hdfs_entry in [(hbase_entry, hdfs_entry)
                                        for hbase_entry in hbase_list
                                        for hdfs_entry in hdfs_list]:
            if hbase_entry['id'] == hdfs_entry['id']:
                # remove entries in the HDFS list that match hbase
                inter_list.append(hbase_entry)
                hdfs_list.remove(hdfs_entry)
                hbase_list.remove(hbase_entry)
        # intersection found
        if len(inter_list) > 0:
            logging.debug("The intersection list is: %s", inter_list)
            self.master_dataset = inter_list + hdfs_list
            if len(hbase_list) != 0:
                logging.warn(" Warning Untracked datasets of size %d",
                             len(hbase_list))
                self.master_dataset = self.master_dataset + tag_for_integrity(
                    hbase_list)
        else:
            # no overlap at all between the HBase and HDFS entries
            self.master_dataset = tag_for_integrity(hbase_list) + hdfs_list

    def read_data_from_repo(self):
        """
        Read data from HDFS repo_path
        :return:
        """
        repo_path = self.repo_path
        hdfs_dataset = list()
        try:
            for root, dirs, _ in self.client.walk(repo_path,
                                                  topdown=True,
                                                  onerror=onerror):
                for entry in dirs:
                    m_source = re.match('^source=(?P<source>.*)', entry)
                    if m_source is None:
                        continue
                    elif m_source.group('source') == '':
                        logging.warn(
                            'An empty source is present; this is not allowed. '
                            'Something went wrong during ingestion')
                        continue
                    else:
                        item = {
                            DATASET.ID: m_source.group('source'),
                            DATASET.POLICY: POLICY.SIZE,
                            DATASET.PATH: os.path.join(root, entry),
                            DATASET.MODE: 'keep'
                        }
                        hdfs_dataset.append(item)
                break
        except HdfsException as exception:
            logging.warn("Error in walking HDFS File system %s",
                         exception.message)
        return hdfs_dataset

    def retrieve_datasets_from_hbase(self):
        """
        Connect to hbase table and return list of hbase_dataset
        :return:
        """
        hbase_datasets = list()
        table_name = self.table_name
        try:
            with self.conn_pool.connection(
                    DB_CONNECTION_TIME_OUT) as connection:
                if table_name not in connection.tables():
                    logging.info('creating hbase table %s', table_name)
                    connection.create_table(table_name, {'cf': dict()})

                table = connection.table(table_name)
                for _, data in table.scan(limit=1):
                    logging.debug('%s found', table_name)

                logging.debug('connecting to hbase to read hbase_dataset')
                # keep the full scan inside the pooled-connection context
                for key, data in table.scan():
                    item = {
                        DATASET.ID: key,
                        DATASET.PATH: data[DBSCHEMA.PATH],
                        DATASET.POLICY: data[DBSCHEMA.POLICY],
                        DATASET.MODE: data[DBSCHEMA.MODE]
                    }
                    if item[DATASET.POLICY] == POLICY.AGE:
                        item[DATASET.MAX_AGE] = int(data[DBSCHEMA.RETENTION])
                    elif item[DATASET.POLICY] == POLICY.SIZE:
                        item[DATASET.MAX_SIZE] = int(data[DBSCHEMA.RETENTION])
                    hbase_datasets.append(item)
        except TException as exception:
            logging.warn("Failed to read table from hbase, error(%s):",
                         exception.message)
            return hbase_datasets
        logging.info(hbase_datasets)
        return hbase_datasets

    def read_datasets(self):
        """
        Return the cached list of datasets built by collect()
        :return:
        """
        return self.master_dataset

    def read_partitions(self, data_path):
        """
        Read partitions for an HDFS dataset
        :param data_path:
        :return:
        """
        data_parts = list()
        try:
            for entry in dirwalk(self.client, data_path):
                if entry not in data_parts:
                    data_parts.append(entry)
        except HdfsException as exception:
            logging.warn(
                "Error walking the HDFS file system for partitions, errormsg: %s",
                exception.message)
        return data_parts

    def write_dataset(self, data):
        """
        Persist dataset entry into HBase Table
        :param data: dataset entry that needs updating
        :return: None
        """
        try:
            logging.debug("Write dataset:{%s}", data)
            table_name = self.table_name
            with self.conn_pool.connection(
                    DB_CONNECTION_TIME_OUT) as connection:
                table = connection.table(table_name)
                dataset = {
                    DBSCHEMA.PATH: data[DATASET.PATH],
                    DBSCHEMA.POLICY: data[DATASET.POLICY],
                    DBSCHEMA.MODE: data[DATASET.MODE]
                }
                if DATASET.RETENTION in data:
                    dataset[DBSCHEMA.RETENTION] = data[DATASET.RETENTION]
                logging.debug("calling put on table for %s", dataset)
                table.put(data[DATASET.ID], dataset)
        except TException as exception:
            logging.warn("Failed to write dataset into hbase,  error(%s):",
                         exception.message)

    def delete_dataset(self, data):
        """
        Delete dataset entry from HBase.
        :param data: dataset instance
        :return: None
        """
        try:
            table_name = self.table_name
            with self.conn_pool.connection(
                    DB_CONNECTION_TIME_OUT) as connection:
                table = connection.table(table_name)
                logging.debug("Deleting dataset from HBase:{%s}", data)
                table.delete(data['id'])
        except TException as exception:
            logging.warn("Failed to delete dataset in hbase,  error(%s):",
                         exception.message)
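The reconciliation in collect() is easier to follow on plain dictionaries. This is a minimal sketch of the same idea with hypothetical 'id' values and no HBase or HDFS dependencies: entries known to both stores keep the HBase copy, and whatever remains in hbase_list is what collect() would hand to tag_for_integrity:

hbase_list = [{"id": "a", "policy": "age"}, {"id": "c", "policy": "size"}]
hdfs_list = [{"id": "a", "policy": "size"}, {"id": "b", "policy": "size"}]

inter_list = []
for hbase_entry in list(hbase_list):      # iterate over copies so removal is safe
    for hdfs_entry in list(hdfs_list):
        if hbase_entry["id"] == hdfs_entry["id"]:
            inter_list.append(hbase_entry)    # keep the HBase copy
            hdfs_list.remove(hdfs_entry)
            hbase_list.remove(hbase_entry)

master_dataset = inter_list + hdfs_list
print(master_dataset)  # [{'id': 'a', 'policy': 'age'}, {'id': 'b', 'policy': 'size'}]
print(hbase_list)      # [{'id': 'c', 'policy': 'size'}] -> untracked, would be tagged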
Example #4
def showAllDirAndFiles(client_: HdfsClient, path_: str):
    for _root, _dir, _files in client_.walk(path_, status=True):
        print('_root = ' + str(_root))
        print('_dir = ' + str(_dir))
        print('_files = ' + str(_files))
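A minimal call of the helper above; the connection details and path are placeholders, and exactly what walk(..., status=True) attaches to _root, _dir, and _files depends on the HDFS client library in use:

from pyhdfs import HdfsClient  # assumed client, matching the constructor in Example #3

client = HdfsClient(hosts="namenode:50070", user_name="hdfs")
showAllDirAndFiles(client, "/user/shiyouguandao")  # prints root, directories, and files per level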