Example #1
class Storage:
    def __init__(self, protocol: str = 'webHDFS', *args, **kwargs):
        self.protocol, self.client = protocol.lower(), None
        if self.protocol == 'webhdfs':
            from hdfs import InsecureClient
            self.client = InsecureClient(*args, **kwargs)
            # Bind the generic method names to their webhdfs implementations.
            for f in 'upload download list status delete'.split():
                setattr(self, f, getattr(self, '%s_%s' % (f, self.protocol)))

    def upload_webhdfs(self, local_path: str, remote_path: str, **kwargs):
        to_screen("upload %s -> %s" % (local_path, remote_path))
        return self.client.upload(local_path=local_path,
                                  hdfs_path=remote_path,
                                  **kwargs)

    def download_webhdfs(self, remote_path: str, local_path: str, **kwargs):
        mkdir_for(local_path)
        to_screen("download %s -> %s" % (remote_path, local_path))
        return self.client.download(local_path=local_path,
                                    hdfs_path=remote_path,
                                    overwrite=True,
                                    **kwargs)

    def list_webhdfs(self, remote_path: str, **kwargs):
        return self.client.list(hdfs_path=remote_path, **kwargs)

    def status_webhdfs(self, remote_path: str, **kwargs):
        return self.client.status(hdfs_path=remote_path, **kwargs)

    def delete_webhdfs(self, remote_path: str, **kwargs):
        return self.client.delete(hdfs_path=remote_path, **kwargs)
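A minimal usage sketch for the Storage wrapper above, with a hypothetical namenode URL, user and paths; it assumes the to_screen and mkdir_for helpers the class relies on are defined in the same module:

storage = Storage('webHDFS', 'http://namenode.example.org:9870', user='hdfs')
storage.upload('/tmp/report.csv', '/data/reports/report.csv')
print(storage.list('/data/reports'))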
Example #2
def get_hdfs_max_date():
    SUCCESS_FILE = f"{PARQUET_FILE}/_SUCCESS"

    try:
        client = InsecureClient('http://namenode:9870', user='******')
        time_ts = client.status(SUCCESS_FILE)["modificationTime"] / 1000
        return date.fromtimestamp(time_ts)
    except Exception:
        log("Exception while trying to get parquet max date")
        log(traceback.format_exc())
        return DEFAULT_DATE
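get_hdfs_max_date() depends on a few module-level names that are not shown here; a hedged sketch of what they might look like (all values are illustrative assumptions):

from datetime import date
import traceback
from hdfs import InsecureClient

PARQUET_FILE = '/warehouse/events.parquet'  # hypothetical dataset path
DEFAULT_DATE = date(1970, 1, 1)             # hypothetical fallback date
log = print                                 # stand-in for the project's logger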
Example #3
class interHDFS:
    def __init__(self, url, user=None, **kwargs):
        self.url = url
        self.user = user
        for k, v in kwargs.items():
            setattr(self, k, v)
        self.connect = InsecureClient(self.url, self.user)
        try:
            self.connect.status('/')
        except Exception as e:
            print(f"[ERROR]: {e}")
            raise ConnectionError("connection failed!") from e

    @property
    def apiVersion(self):
        return "v1"

    def listDir(self, dirname: str = '/'):
        return self.connect.list(dirname)

    def getFiles(self, dirname: str, depth: int = 0) -> list:
        files = []
        if not dirname:
            print("dirname is null")
            return files
        # walk() yields (root, dirnames, filenames) tuples, like os.walk()
        for root, _dirs, filenames in self.connect.walk(dirname, depth=depth):
            for f in filenames:
                files.append(root + '/' + f)
        return files

    def downloadToCsv(self, filename: str) -> None:
        '''only split for the '€€' sign, and generate same filename in current directory'''
        with self.connect.read(filename, encoding='utf-8') as reader:
            with open(csvdir + filename.split('/')[-1].split('.')[0] + '.csv',
                      'a+') as cf:
                for line in reader.readlines():
                    newline = line.replace('€€', ',')
                    cf.write(newline)
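A short, hedged usage sketch for interHDFS (hypothetical namenode URL and paths; downloadToCsv additionally assumes a module-level csvdir output directory):

fs = interHDFS('http://namenode.example.org:9870', user='hdfs')
print(fs.apiVersion)
print(fs.listDir('/'))
print(fs.getFiles('/data', depth=1))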
Example #4
    def get(self):

        # Fetch the dataset used for evaluation
        df = get_data_cassandra()

        print(df.head())
        X = df['total_estimated_load'].values

        # evaluate parameters (p,d,q)  <=> (AR, I, MA)
        p_values = 7
        d_values = 0
        q_values = 5
        # best_cfg, best_score = evaluate_models(X, p_values, d_values, q_values)
        best_cfg = (p_values, d_values, q_values)

        # Train the best model
        model = ARIMA(X, order=best_cfg)
        model_fit = model.fit()

        # Save the model locally, creating the export folder if it does not exist
        if not os.path.exists(model_local_path):
            os.makedirs(model_local_path, exist_ok=True)

        model_fit.save(model_local_path + model_name)

        # Connect to the HDFS client
        client = InsecureClient(url='http://namenode:9870', user='******')

        # Create the remote folder for processed files if needed
        if client.status(model_hdfs_remote_path, strict=False) is None:
            client.makedirs(model_hdfs_remote_path)

        # Copy the model to HDFS
        remote_load_path = client.upload(model_hdfs_remote_path,
                                         model_local_path + model_name,
                                         overwrite=True)
        # print(remote_load_path)

        print(client.list(model_hdfs_remote_path))

        return {'best_cfg': best_cfg, 'status': 'Terminated'}
Example #5
    def get(self, period):

        print("Period to predict : ", period)

        # Connect to the HDFS client
        client = InsecureClient(url='http://namenode:9870', user='******')

        # Check that the saved model is present on HDFS
        if client.status(model_hdfs_remote_path + model_name, strict=False) is not None:

            # load model
            client.download(model_hdfs_remote_path + model_name, model_local_path, overwrite=True)
            model_fit = ARIMAResults.load(model_local_path + model_name)

            # Dataset used for evaluation
            df = get_data_cassandra()
            print(df.head())
            X = df['total_estimated_load'].values

            start_index = len(X)
            end_index = start_index + int(period)
            forecast = model_fit.predict(start=start_index, end=end_index)

            # df['date_est_load'] = df['date_est_load'].apply(pd.Timestamp)
            day = df['date_est_load'].values[-1].date()
            print(day)
            print(type(day))
            day += datetime.timedelta(days=1)

            res = {}
            for yhat in forecast:
                res[day.strftime("%d/%m/%Y")] = yhat
                day += datetime.timedelta(days=1)

            return res

        return "Service has been stopped"
    def on_data(self, data):
        try:
            if self.count <= 10000000:
                with open(self.outfile, 'a+') as f:
                    f.write(data)
                self.count += len(data)
                return True
            else:
                hdfs_path = '/team40/stream_data/' + time.strftime(
                    '%Y-%m-%d_%H-%M', time.localtime()) + self.outfile
                client = InsecureClient('http://115.146.86.32:50070',
                                        user='******')
                client.upload(hdfs_path, self.outfile)
                print(client.status(hdfs_path, strict=False))
                self.count = 0
                with open(self.outfile, 'w') as f:
                    f.write(data)
                self.count += len(data)
                return True

        except BaseException as e:
            print("Error on_data: %s" % str(e))
        return True
Example #7
def generate_random_tensor_data_hdfs(
        all_tensors_config,
        cardinalities,
        tensor_name,
        zero_based_indices=False,
        hdfs_url='http://spark-master0-dsl05:50070',
        hdfs_user='******'):
    # generate tensor data on local file

    hdfs_filename = os.path.join(gctf_data_path_no_url, tensor_name + '.csv')
    print('generate_random_tensor_data_hdfs: generating %s' % hdfs_filename)

    client = InsecureClient(hdfs_url, user=hdfs_user)
    assert client.status(
        hdfs_filename, strict=False
    ) is None, 'data file %s exists, can not proceed' % hdfs_filename

    with client.write(hdfs_filename, encoding='utf-8') as writer:
        write_header(all_tensors_config, tensor_name, writer)
        iter_indices_gen_data(all_tensors_config, cardinalities, tensor_name,
                              writer, zero_based_indices)
        # fd.close() # TODO: hdfs client api does not specify close?

    all_tensors_config[tensor_name]['hdfs_filename'] = hdfs_filename
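Example #8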
from datetime import date

# hdfs_path = '/projects/projectfinder/raw/items/' +\
#    date.today().year.__str__() + '/' +\
#    date.today().month.__str__() + '/'

#%% 
hdfs_path = '/projects/projectfinder/raw/items/2019'

#%%
# hdfs_client is assumed to be an hdfs.InsecureClient created in an earlier (not shown) cell
hdfs_client.download(hdfs_path, 'hdfs_data', n_threads=5)


#%% 

hdfs_client_status = hdfs_client.status('/', strict=True)
hdfs_client_status

#%% 
hdfs_file_status = hdfs_client.list(hdfs_path)
hdfs_file_status


#%% [markdown]
# Go to the [manual](https://hdfscli.readthedocs.io/en/latest/advanced.html#path-expansion)
# ```bash
# # install hdfs using pip
# pip install hdfs
# ```

#%%
Example #9
class HDFSLibrary:
    """
        Test library for working with HDFS
    """
    WEB_HDFS_URL = ""
    client = ""

    def __init__(self, namenode="localhost", port="50070"):
        self.WEB_HDFS_URL = 'http://' + namenode + ':' + str(port)
        print namenode, ">>", port, ">>", self.WEB_HDFS_URL
        self.client = InsecureClient(self.WEB_HDFS_URL)

    def check_hdfs_file_exists(self, file_path, stop=False):
        if self.client.status(file_path, strict=False) is None:
            if stop:
                print "ERROR: File does not exist: ", file_path
                return "ERROR: File does not exist: ", file_path
                # exit(172)
            return False
        return True

    def get_hdfs_file_content(self, file_path):
        self.check_hdfs_file_exists(file_path, stop=True)
        data = ""
        with self.client.read(file_path) as reader:
            for line in reader:
                data += line
        return data

    def search_string_in_hdfs_file(self,
                                   file_path,
                                   text1,
                                   text2="aqwszx",
                                   text3="xzswqa"):
        ret = self.check_hdfs_file_exists(file_path, stop=True)
        found = "" if ret else ret
        with self.client.read(file_path) as reader:
            for line in reader:
                if line.find(text1) == -1 and line.find(
                        text2) == -1 and line.find(text3) == -1:
                    continue
                found += line
        return found

    def hdfs_file_should_not_contain(self,
                                     file_path,
                                     text1,
                                     text2="aqwszx",
                                     text3="xzswqa"):
        self.check_hdfs_file_exists(file_path, stop=True)
        with self.client.read(file_path) as reader:
            for line in reader:
                if line.find(text1) != -1 or line.find(
                        text2) != -1 or line.find(text3) != -1:
                    return False
        return True

    ########################
    # # BASIC FUNCTIONS: # #
    ########################
    def get_hdfs_file_folder_content_summary(self, file_path):
        """
        Retrieving a file or folder content summary.
        :return: returns a file or folder content summary.
        """
        self.check_hdfs_file_exists(file_path, stop=True)
        return self.client.content(file_path)

    def get_hdfs_file_folder_status(self, file_path):
        """
        Retrieving a file or folder status.
        :return: returns a file or folder status.
        """
        self.check_hdfs_file_exists(file_path, stop=True)
        return self.client.status(file_path)

    def list_hdfs_directory(self, folder_path):
        """
        Listing all files inside a directory.
        :return: returns a file list.
        """
        self.check_hdfs_file_exists(folder_path, stop=True)
        return self.client.list(folder_path)

    def move_hdfs_file(self, old_path, new_path):
        """
        Renaming ("moving") a file.
        :return: NA
        """
        self.check_hdfs_file_exists(old_path, stop=True)
        self.client.rename(old_path, new_path)

    def delete_hdfs_file(self, file_path):
        """
        Deleting a file or folder recursively.
        :return: returns `True` if the deletion was successful otherwise `False`
        """
        self.check_hdfs_file_exists(file_path)
        return self.client.delete(file_path, recursive=True)

    def copy_to_local_hdfs_file(self, hdfs_path, local_path):
        """
        Copy a file or folder from HDFS to local.
        :return: local_path
        """
        self.check_hdfs_file_exists(hdfs_path)
        return self.client.download(hdfs_path,
                                    local_path,
                                    overwrite=True,
                                    n_threads=4)

    def copy_from_local_hdfs_file(self, local_path, hdfs_path):
        """
        Copy a file or folder from local to HDFS.
        :return: hdfs_path
        """
        return self.client.upload(hdfs_path,
                                  local_path,
                                  overwrite=True,
                                  n_threads=4)

    def get_hdfs_file_checksum(self, file_path):
        """
        Get the checksum value for file
        :return: checksum
        """
        self.check_hdfs_file_exists(file_path, stop=True)
        return self.client.checksum(file_path)

    def create_hdfs_dir(self, dir_path, perm=755):
        """
        Create a directory or recursive dirs on HDFS
        :return: NA
        """
        self.client.makedirs(dir_path, permission=perm)
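A brief, hedged usage sketch for the HDFSLibrary test library above (hypothetical namenode host and file paths):

lib = HDFSLibrary(namenode="namenode.example.org", port="50070")
if lib.check_hdfs_file_exists("/data/input/part-00000"):
    content = lib.get_hdfs_file_content("/data/input/part-00000")
    lib.copy_to_local_hdfs_file("/data/input/part-00000", "/tmp/part-00000")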
Example #10
from hdfs import InsecureClient
from kafka import KafkaConsumer
import bson
import time

MAX_SIZE_FILE = 1024 * 1024 * 128  # 128mb (in bytes)

# Init HDFS
client = InsecureClient('http://X:50070', user='******')
hdfs_dir = 'tweets/'
hdfs_file = 'tweets.json'
hdfs_files_list = client.list(hdfs_dir)
if len(hdfs_files_list) > 0:
    # Get last file
    hdfs_file = sorted(hdfs_files_list, reverse=True)[0]
    hdfs_file_num = int(hdfs_file.split('.')[0])
    hdfs_file_size = client.status(hdfs_dir + hdfs_file)['length']  # in bytes
else:
    # Create file
    hdfs_file_num = 1
    hdfs_file = str(hdfs_file_num) + '.json'
    hdfs_file_size = 0  # 0 bytes
    client.write(hdfs_dir + hdfs_file, '')

# Init kafka
consumer = KafkaConsumer('X', group_id='X', bootstrap_servers='X:9092')

print(time.strftime("%Y-%m-%d %H:%M:%S") +
      ' [INFO] init KAFKA consumer and HDFS connection ok')

# New kafka message
for msg in consumer:
Example #11
class HDFSStorage(Storage):
    """
    HDFS storage
    """

    def fix_slashes(self, path):
        sep = os.path.sep
        if path[0] != sep:
            path = sep + path
        if path[-1] != sep:
            path = path + sep
        return path

    def __init__(self, location=None, base_url=None):
        self.hdfs_hosts = settings.HDFS_STORAGE['hosts']
        self.hdfs_root = self.fix_slashes(settings.HDFS_STORAGE['root'])
        self.media_root = settings.MEDIA_ROOT
        self.media_url = self.fix_slashes(settings.MEDIA_URL)

        self.fetch_url = '%s/webhdfs/v1%s%%s?op=OPEN' % (self.hdfs_hosts.split(',')[0], self.hdfs_root)
        self.client = InsecureClient(self.hdfs_hosts)

    def _open(self, name, mode='rb'):
        local_path = os.path.join(settings.MEDIA_ROOT, name.replace('/', os.path.sep))
        if not os.path.exists(local_path):
            remote_path = self.path(name)
            local_dir = os.path.dirname(local_path)
            if not os.path.exists(local_dir):
                os.makedirs(local_dir)
            print(self.client.download(remote_path, local_path=local_path, overwrite=True,
                                       temp_dir=tempfile.gettempdir()))
        return File(open(local_path, mode))

    def _save(self, name, content):
        print("_save(%s, %s, %s)" % (self, name, content))
        local_path = content.name
        hdfs_path = self.path(name)  # os.path.basename(local_path))
        print(hdfs_path, local_path)
        self.client.write(hdfs_path, data=content, overwrite=True)
        return name

    def url(self, name):
        return self.fetch_url % name

    def delete(self, name):
        return self.client.delete(self.path(name))

    def listdir(self, path):
        file_list = []
        dir_list = []
        for name, status in self.client.list(self.path(path), status=True):
            if status['type'] == 'DIRECTORY':
                dir_list.append(name)
            elif status['type'] == 'FILE':
                file_list.append(name)
        return dir_list, file_list

    def size(self, name):
        return self.client.status(self.path(name))['length']

    def exists(self, name):
        try:
            return True if self.client.status(self.path(name)) else False
        except HdfsError:
            return False

    def path(self, name):
        return (self.hdfs_root + name).replace('\\', '/')
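The HDFSStorage backend above pulls its configuration from Django settings; a hedged sketch of the assumed layout (hosts and paths are illustrative only):

# settings.py (illustrative values)
HDFS_STORAGE = {
    'hosts': 'http://namenode1.example.org:50070,http://namenode2.example.org:50070',
    'root': '/django/media/',
}
MEDIA_ROOT = '/var/www/media'
MEDIA_URL = '/media/'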
Example #12
class WebHDFSStore():
    '''
    A file store based on the WebHDFS protocol.
    '''
    # Set a refresh-date to indicate when we did this lookup:
    refresh_date = datetime.datetime.utcnow().isoformat(
        timespec='milliseconds') + 'Z'

    def __init__(self, service_id, user_override=None):
        self.service_id = service_id
        self.webhdfs_url = HADOOPS[service_id]['webhdfs_url']
        self.webhdfs_user = HADOOPS[service_id]['webhdfs_user']
        if user_override:
            self.webhdfs_user = user_override
        self.id_prefix = HADOOPS[service_id]['id_prefix']
        self.client = InsecureClient(self.webhdfs_url, self.webhdfs_user)

    def put(self, local_path, hdfs_path, backup_and_replace=False):
        # Get the status of the destination:
        dest_status = self.client.status(hdfs_path, strict=False)

        # Handle files or directories:
        if os.path.isfile(local_path):
            hdfs_path = self._combine_paths(dest_status, local_path, hdfs_path)
            return self._upload_file(local_path, hdfs_path, backup_and_replace)
        elif os.path.isdir(local_path):
            # TODO, if it's a directory
            raise Exception(
                "Cannot upload anything other than single files at this time!")
        else:
            raise Exception("Unknown path type! Can't handle %s" % local_path)

    def _combine_paths(self, dest_status, local_path, hdfs_path):
        # If the hdfs_path is a directory, combine the paths:
        if dest_status and dest_status['type'] == 'DIRECTORY':
            combined_path = psp.join(hdfs_path, local_path)
            logger.info("Using combined path: %s" % combined_path)
            return combined_path
        else:
            # Otherwise, just return the path:
            return hdfs_path

    def _upload_file(self, local_path, hdfs_path, backup_and_replace=False):
        """
        Copy up to HDFS, making it suitably atomic by using a temporary filename during upload.

        :return: None
        """

        # Set up flag to record outcome:
        success = False

        # Calculate hash of local file:
        logger.info("Calculating hash of %s" % local_path)
        if not os.path.isfile(local_path):
            raise Exception("Cannot upload %s - individual files only!" %
                            local_path)
        local_hash = calculate_sha512_local(local_path)
        logger.info("Local %s hash is %s " % (local_path, local_hash))

        #
        # TODO Allow upload  to overwrite truncated files?
        #

        # Check if the destination file exists:
        already_exists = self.exists(hdfs_path)
        if already_exists and not backup_and_replace:
            logger.warning(
                "Path %s already exists! No upload will be attempted." %
                hdfs_path)
        else:
            # Upload to a temporary path:
            tmp_path = "%s_temp_" % hdfs_path

            # Now upload the file, allowing overwrites as this is a temporary file and
            # simultaneous updates should not be possible:
            logger.info("Uploading as %s" % tmp_path)
            with open(local_path, 'rb') as reader, self.client.write(
                    tmp_path, overwrite=True) as writer:
                while True:
                    data = reader.read(10485760)
                    if not data:
                        break
                    writer.write(data)

            # If set, backup-and-replace as needed:
            if backup_and_replace and already_exists:
                date_stamp = datetime.datetime.now().strftime(
                    '%Y-%m-%d_%H-%M-%S')
                backup_path = "%s.bkp_%s" % (hdfs_path, date_stamp)
                logger.warning("Renaming %s to %s..." %
                               (hdfs_path, backup_path))
                self.client.rename(hdfs_path, backup_path)

            # Move the uploaded file into the right place:
            logger.info("Renaming %s to %s..." % (tmp_path, hdfs_path))
            self.client.rename(tmp_path, hdfs_path)

            # Give the namenode a moment to catch-up with itself and then check it's there:
            # FIXME I suspect this is only needed for our ancient HDFS
            time.sleep(2)
            status = self.client.status(hdfs_path)

        logger.info("Calculating hash of HDFS file %s" % hdfs_path)
        hdfs_hash = self.calculate_sha512(hdfs_path)
        logger.info("HDFS %s hash is %s " % (hdfs_path, hdfs_hash))
        if local_hash != hdfs_hash:
            raise Exception("Local & HDFS hashes do not match for %s" %
                            local_path)
        else:
            logger.info("Hashes are equal!")
            success = True

        # Log successful upload:
        logger.warning("Upload completed for %s" % hdfs_path)

        # And return success flag so caller knows it worked:
        return success

    def move(self, local_path, hdfs_path):
        # Perform the PUT first:
        success = self.put(local_path, hdfs_path)
        # And delete the local file if that worked:
        if success == True:
            os.remove(local_path)

    def calculate_sha512(self, path):
        '''
        Calculate the SHA512 hash of a single file on HDFS
        '''
        with self.client.read(path) as reader:
            file_hash = calculate_reader_hash(reader, path)

        return file_hash

    def _to_info(self, path, status):
        # Add the file path:
        status['file_path'] = path
        # Classify based on HDFS storage conventions:
        item = HdfsPathParser(status).to_dict()
        # Work out the permissions string:
        if status['permission'].isnumeric():
            permissions = permissions_octal_to_string(int(
                status['permission']))
            if status['type'] == 'DIRECTORY':
                permissions = "d" + permissions
            else:
                permissions = "-" + permissions
        else:
            permissions = status['permission']
        # Defined fields based on directory/file status
        if permissions[0] == 'd':
            fs_type = 'directory'
            access_url = '%s/webhdfs/v1%s?op=LISTSTATUS&user.name=%s' % (
                self.webhdfs_url, item['file_path'], self.webhdfs_user)
        else:
            fs_type = 'file'
            access_url = '%s/webhdfs/v1%s?op=OPEN&user.name=%s' % (
                self.webhdfs_url, item['file_path'], self.webhdfs_user)
        # And return as a 'standard' dict:
        return {
            'id': '%s%s' % (self.id_prefix, item['file_path']),
            'refresh_date_dt': self.refresh_date,
            'file_path_s': item['file_path'],
            'file_size_l': item['file_size'],
            'file_ext_s': item['file_ext'],
            'file_name_s': item['file_name'],
            'permissions_s': permissions,
            'hdfs_replicas_i': item['number_of_replicas'],
            'hdfs_user_s': item['user_id'],
            'hdfs_group_s': item['group_id'],
            'modified_at_dt': "%sZ" % item['modified_at'],
            'timestamp_dt': "%sZ" % item['timestamp'],
            'year_i': item['timestamp'][0:4],
            'recognised_b': item['recognised'],
            'kind_s': item['kind'],
            'collection_s': item['collection'],
            'stream_s': item['stream'],
            'job_s': item['job'],
            'layout_s': item['layout'],
            'hdfs_service_id_s': self.service_id,
            'hdfs_type_s': fs_type,
            'access_url_s': access_url
        }

    def list(self, path, recursive=False):
        # Handle non-existant entry, or a file:
        path_status = self.client.status(path, strict=False)
        if path_status is None:
            raise Exception("No such file or directory: %s" % path)
        elif path_status['type'] == 'FILE':
            # Plain old file:
            yield self._to_info(path, path_status)
        else:
            # Handle folders:
            if recursive:
                for dir_info, dirs_info, files_info in self.client.walk(
                        path, status=True):
                    dir_path, dir_status = dir_info
                    for file_name, file_status in files_info:
                        file_path = psp.join(dir_path, file_name)
                        yield self._to_info(file_path, file_status)
            else:
                for file_name, file_status in self.client.list(path,
                                                               status=True):
                    file_path = psp.join(path, file_name)
                    yield self._to_info(file_path, file_status)

    def exists(self, path):
        status = self.client.status(path, strict=False)
        if status:
            return True
        else:
            return False

    def rm(self, path):
        # And delete from HDFS (usually prevented by API proxy)
        # Hard-coded to never act recursively - if you want that, do it manually via the back-end.
        self.client.delete(path, recursive=False)

    def stream(self, path, offset=0, length=None):
        # NOTE our WebHDFS service is very old and uses 'len' not 'length' for controlling the response length:
        # The API proxy we use attempts to remedy this by mapping any 'length' parameter to 'len'.
        return self.client.read(path, offset=offset, length=length)

    def read(self, path, offset=0, length=None):
        with self.stream(path, offset, length) as reader:
            while True:
                data = reader.read(10485760)
                if not data:
                    break
                yield data

    def lsr_to_items(self, reader):
        """
        This task processes a raw list of files generated by the hadoop fs -lsr command.

        As this can be a very large list, it avoids reading it all into memory. It
        parses each line, and yields a suitable stream of parsed objects matching the WebHDFS API.
        """
        for line in reader:
            if "lsr: DEPRECATED: Please use 'ls -R' instead." in line:
                logger.warning(line)
            else:
                permissions, number_of_replicas, userid, groupid, filesize, modification_date, modification_time, filename = line.split(
                    None, 7)
                filename = filename.strip()
                timestamp = datetime.datetime.strptime(
                    '%s %s' % (modification_date, modification_time),
                    '%Y-%m-%d %H:%M')
                info = {
                    'permission': permissions,
                    'replication': number_of_replicas,
                    'owner': userid,
                    'group': groupid,
                    'length': filesize,
                    'modificationTime': timestamp.timestamp() * 1000,
                    'pathSuffix': filename
                }
                # Skip directories, yielding entries for files only:
                if permissions[0] == 'd':
                    info['type'] = 'DIRECTORY'
                else:
                    info['type'] = 'FILE'
                    yield self._to_info(filename, info)
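A hedged usage sketch for WebHDFSStore; the HADOOPS registry it reads at construction time, the service id and the paths below are illustrative assumptions only:

# Illustrative service registry the class looks up:
HADOOPS = {
    'h3': {
        'webhdfs_url': 'http://namenode.example.org:14000',
        'webhdfs_user': 'ingest',
        'id_prefix': 'hdfs://h3',
    },
}

store = WebHDFSStore('h3')
store.put('crawl-output.warc.gz', '/heritrix/output/crawl-output.warc.gz')
for info in store.list('/heritrix/output/', recursive=False):
    print(info['file_path_s'], info['file_size_l'])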
Example #13
 hive_template = 'hive://<hive-host>:<hive-port>/{db}'.format(db=db)
 hive_path = '/user/hive/warehouse/{db}.db/{table}/ds={date}/{filename}'.format(
     db=db, table=table, filename=filename, date=date)
 local_path = '/home/data/superhero/redis_stats/info_{date}'.format(
     date=date)
 hdfs_path = '/tmp/{db}/{filename}'.format(**{
     'filename': filename,
     'db': db
 })
 try:
     hdfs_client = InsecureClient(hdfs_url)
     engine = sqlalchemy.create_engine(hive_template)
     conn_hive = engine.raw_connection()
     cur = conn_hive.cursor()
     # Check whether the file exists
     if hdfs_client.status(hive_path, strict=False):
         # Print the file size
         print hdfs_client.status(hive_path,
                                  strict=False).get('length', '0')
         print 'Data In Hive!'
     else:
         print 'Warning: {} Not In Hive!'.format(table)
     # Delete the Hive file
     try:
         hdfs_client.delete(hive_path)
         print '{hive_path} Delete Complete'.format(hive_path=hive_path)
     except Exception, e:
         print e
         print '{hive_path} Delete Failed'.format(hive_path=hive_path)
     # Upload the local file to Hive
     if os.path.exists(local_path):
Example #14
class Uploader():
    """
    Initialise and set-up the HDFS connection:
    """
    def __init__(self, hadoop_url, hadoop_user):
        # Set up client:
        self.hdfsClient = InsecureClient(hadoop_url, user=hadoop_user)

    def write_hash_file(self, path, hash, on_hdfs=False):
        if on_hdfs:
            raise Exception("Writing hash to HDFS not supported yet.")
        else:
            hash_path = "%s.sha512" % path
            if (os.path.exists(hash_path)):
                logger.warning("Hash file %s already exists." % hash_path)
            else:
                with open(hash_path, 'w') as hash_file:
                    hash_file.write("%s\n" % hash)

    def safe_upload(self, localFile, hdfsFile, removeLocal=True):
        """
        This performs a safe upload - it will never overwrite a file on HDFS, and it uses checksums to verify the transfer.

        :param localFile:
        :param hdfsFile:
        :return:
        """

        # get local file hash and size
        localHash = get_checksum(localFile)
        self.write_hash_file(localFile, localHash)
        localSize = os.path.getsize(localFile)
        localModtime = datetime.fromtimestamp(os.path.getmtime(localFile))

        # store checksum as a local file:

        # upload file to HDFS if not already existing
        hdfsFileStatus = self.hdfsClient.status(hdfsFile, strict=False)
        if hdfsFileStatus is None:
            logger.info('---- ----')
            logger.info("Copying %s to HDFS %s" % (localFile, hdfsFile))
            logger.info("localFile size %i hash %s date %s" %
                        (localSize, localHash, localModtime))
            with open(localFile, 'rb') as f:
                self.hdfsClient.write(data=f,
                                      hdfs_path=hdfsFile,
                                      overwrite=False)
            time.sleep(1)
            hdfsFileStatus = self.hdfsClient.status(hdfsFile, strict=False)

        # test if local and HDFS same
        if localSize != hdfsFileStatus['length']:
            logger.error(
                "hdfsFile %s size differs %i, %s size %i" %
                (hdfsFile, hdfsFileStatus['length'], localFile, localSize))

        else:
            hdfsHash = get_checksum(hdfsFile,
                                    on_hdfs=True,
                                    hdfsClient=self.hdfsClient)
            if localHash != hdfsHash:
                logger.debug("hdfsFile %s hash differs %s, %s hash %s" %
                             (hdfsFile, hdfsHash, localFile, localHash))

            else:
                # if uploaded HDFS file hash same as local file hash, delete local file
                logger.info("hdfsFile size %i hash %s" %
                            (hdfsFileStatus['length'], hdfsHash))
                logger.info("Deleting %s" % localFile)
                os.remove(localFile)
        time.sleep(1)
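A hedged usage sketch for Uploader.safe_upload (hypothetical namenode URL, user and file paths; assumes the get_checksum helper and logger used above are importable):

uploader = Uploader('http://namenode.example.org:50070', 'ingest')
uploader.safe_upload('/data/outgoing/batch-0001.jsonl',
                     '/landing/batch-0001.jsonl')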
Example #15
class HaHadoopConnector:
    def __init__(self, logger, hdfsHosts, user):
        self.logger = logger
        self._hdfsHosts = hdfsHosts
        self._user = user
        self._hdfsCli = None
        self._connHdfsInfo = None
        self._maxRetry = 10
        self._lock = Lock()

    def close(self):
        self._hdfsCli = None
        self._connHdfsInfo = None

    def _printDebug(self, message):
        if self.logger:
            self.logger.debug(message)
        else:
            print message

    def _printError(self, errorMsg):
        if self.logger:
            self.logger.warn(errorMsg)
        else:
            print "[warn] %s" % (errorMsg)

    def _printException(self, exception):
        if self.logger:
            self.logger.exception(exception)
        else:
            print traceback.format_exc(exception)

    def _extractSafeTime(self, message):
        splitedErrorMsg = message.split(' ')
        safeTime = splitedErrorMsg[len(splitedErrorMsg) - 2]
        if safeTime.isdigit():
            return True, int(safeTime) + 5
        else:
            return False, None

    def _setConnection(self):
        if self._connHdfsInfo and self._hdfsCli:
            return self._hdfsCli

        self._lock.acquire()
        for hdfsHost in self._hdfsHosts:
            try:
                self._hdfsCli = InsecureClient(hdfsHost, user=self._user)
                self._hdfsCli.status('/')
                self._connHdfsInfo = hdfsHost
                debugMsg = "connected hdfs : %s" % hdfsHost
                if self.logger:
                    self.logger.debug(debugMsg)
                break
            except HdfsError, e:
                self.close()
                errorMsg = "hdfs error : %s, %s" % (str(e), hdfsHost)
                self._printError(errorMsg)
            except ConnectionError, e:
                self.close()
                errorMsg = "connection error : %s, %s" % (str(e), hdfsHost)
                time.sleep(1)
                self._printError(errorMsg)
            except Exception, e:
                self.close()
                errorMsg = "connection error : %s" % (hdfsHost)
                self._printError(errorMsg)
                self._printException(e)
                if self._lock:
                    self._lock.release()
                raise Exception
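Example #16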
def my_dag_function():
  # host with a statically assigned IP
  hostname = '34.76.18.152'
  # ElasticSearch port
  elk_port = 9200
  # HDFS port
  hdfs_port = 50070

  # Open a connection to ElasticSearch and pull everything that landed in the index over the last 2 minutes.
  es = elasticsearch.Elasticsearch([hostname+":"+str(elk_port)])
  res = es.search(index="dmitriy.voronko", body = {"query" : {"range" : {"@timestamp": {"gte" : "now-2m", "lt" : "now"}}}}, size = 500)

  # Work out the directory name for the current processing date.
  curr_dir_name = (datetime.now()).strftime("%Y%m%d")
  print("Directory for current date: " + curr_dir_name)

  # Open a connection to HDFS and check whether today's target file already exists. If it does not,
  # create it so that writing in append mode works without errors.
  client_hdfs = InsecureClient("http://"+hostname+":"+str(hdfs_port), user="******")
  try:
    status = client_hdfs.status("/tmp/"+curr_dir_name+"/"+curr_dir_name+".json")
  except:
    client_hdfs.write("/tmp/"+curr_dir_name+"/"+curr_dir_name+".json", append=False, encoding="utf-8", data="")

  for doc in res['hits']['hits']:
    client_hdfs.write("/tmp/"+curr_dir_name+"/"+curr_dir_name+".json", encoding="utf-8", append=True, data=json.dumps(doc['_source']))

  # Write the data to ClickHouse.
  # Probably not the most efficient approach: each message from the ElasticSearch index is unpacked into
  # variables, which are then assembled into a tuple and inserted into the ClickHouse table.
  client = Client('localhost', port=9011)

  for doc in res['hits']['hits']:
    data = json.loads(doc['_source']['message'])
    timestamp_v = data['timestamp']
    referer_v = data['referer']
    location_v = data['location']
    remoteHost_v = data['remoteHost']
    partyId_v = data['partyId']
    sessionId_v = data['sessionId']
    pageViewId_v = data['pageViewId']
    eventType_v = data['eventType']
    item_id_v = data['item_id']
    item_price_v = int(data['item_price'])
    item_url_v = data['item_url']
    basket_price_v = None
    if data['basket_price'] != '':
      basket_price_v = data['basket_price']
    detectedDuplicate_v = 1 if data['detectedDuplicate'] == 'true' else 0
    detectedCorruption_v = 1 if data['detectedCorruption'] == 'true' else 0
    firstInSession_v = 1 if data['firstInSession'] == 'true' else 0
    userAgentName_v = data['userAgentName']
    client.execute('INSERT INTO lab1db.lab1_messages (timestamp, referer, location, remoteHost, partyId, sessionId, pageViewId, eventType, item_id, item_price, item_url, basket_price, detectedDuplicate, detectedCorruption, firstInSession, userAgentName) VALUES', [(timestamp_v, referer_v, location_v, remoteHost_v, partyId_v, sessionId_v, pageViewId_v, eventType_v, item_id_v, item_price_v, item_url_v, basket_price_v, detectedDuplicate_v, detectedCorruption_v, firstInSession_v, userAgentName_v)])
Example #17
class HisiHdfs:
    def __init__(self):
        self._c = InsecureClient(url="http://{}:14000".format(
            HisiHdfs.get_host()),
                                 user='******',
                                 root="/")
        # self._c = InsecureClient(url="http://10.154.67.254:14000", user='******', root="/")

    @staticmethod
    def get_host():
        domain = 'hdfs-ngx1.turing-ci.hisilicon.com'
        try:
            socket.gethostbyname(domain)
            return domain
        except Exception as e:
            return '10.154.67.254'

    @staticmethod
    def build_month_path(build_scene):
        '''daily build path'''
        return '/compilepackage/CI_Version/{}/br_hisi_trunk_ai/{}'.\
            format(build_scene, datetime.datetime.today().strftime('%Y%m'))

    @staticmethod
    def prebuild_month_path(build_scene):
        '''compile path'''
        return '/compilepackage/CI_Version/{}/br_hisi_trunk_ai_PRE_COMPILE/{}'.\
            format(build_scene, datetime.datetime.today().strftime('%Y%m'))

    def find_newest_build(self, build_scene):
        builds = self._c.list(HisiHdfs.build_month_path(build_scene), True)
        newest_build_name = None
        for build in builds:
            if type(build) != tuple:
                logging.warning("Unexpected build format {}".format(build))
                continue
            if len(build) < 2:
                logging.warning("Unexpected build format {}".format(build))
                continue
            if type(build[1]) != dict:
                logging.warning("Unexpected build format[1] {}".format(build))
                continue
            if build[1].get('type', None) != "DIRECTORY":
                logging.warning(
                    "Found unexpected build type(not DIRECTORY) {}".format(
                        build))
                continue
            if type(build[0]) != str:
                logging.warning("Unexpected build format[0] {}".format(build))
                continue
            elements = build[0].split('_')
            if len(elements) != 3:
                logging.warning("Unexpected build name {}".format(build))
                continue
            if elements[2] != "newest":
                continue
            # build_date = datetime.datetime.strptime('_'.join(elements[:2]), "%Y%m%d_%H%M%S%f")
            if newest_build_name is None:
                newest_build_name = build[0]
                continue
            if newest_build_name < build[0]:
                newest_build_name = build[0]
        return newest_build_name

    def path_exists(self, base_path: str, build_name: str):
        path = "{}/{}".format(base_path, build_name)
        return self._c.status(path, strict=False) is not None

    def find_package(self,
                     base_path: str,
                     build_name: str,
                     package_type: PackageType,
                     os_type=None,
                     arch=None):
        if os_type is None:
            os_type, arch = get_env()
        path = "{}/{}".format(base_path, build_name)
        packages = self._c.list(path, True)
        pr = package_type.get_name_re()
        for package_name, package_info in packages:
            pm = pr.match(package_name)
            if pm is not None:
                if OsType.analyse_os(pm.group('os')) == os_type and pm.group(
                        'arch') == arch:
                    return package_name
        return None

    def download_package(self, base_path: str, build_name: str,
                         package_name: str, local_path: str):
        return self._c.download(hdfs_path="{}/{}/{}".format(
            base_path, build_name, package_name),
                                local_path=local_path,
                                overwrite=True)

    def download_compile_package(self, build_scene: str, build_name: str,
                                 package_name: str, local_path: str):
        return self.download_package(HisiHdfs.prebuild_month_path(build_scene),
                                     build_name, package_name, local_path)

    def download_daily_package(self, build_scene: str, build_name: str,
                               package_name: str, local_path: str):
        return self.download_package(HisiHdfs.build_month_path(build_scene),
                                     build_name, package_name, local_path)

    def download_newest(self,
                        local_path: str,
                        packages: List[PackageType],
                        os_type=None,
                        arch=None):
        if not os.path.isdir(local_path):
            raise FileNotFoundError(
                "The path {} does not exist".format(local_path))
        if os_type is None:
            os_type, arch = get_env()

        build_scenes_to_build_name = {}
        package_names = []
        print("Begin to download newest run packages from the newest")
        for package in packages:
            build_scene = package.get_build_scene()
            newest_build_name = build_scenes_to_build_name.get(
                build_scene, self.find_newest_build(build_scene))
            if newest_build_name is None:
                logging.error("Can not find the newest build")
                raise Exception("Can not find the newest build")
            package_name = self.find_package(
                HisiHdfs.build_month_path(build_scene), newest_build_name,
                package, os_type, arch)
            if package_name is None:
                logging.error(
                    "Can not find the package {}, os {}, arch {}".format(
                        package, os_type, arch))
                raise Exception("Can not find package")
            with shell_printer.DotPrinter(
                    "Begin to download {} from {} to {}".format(
                        package_name, newest_build_name, local_path)):
                self.download_daily_package(build_scene, newest_build_name,
                                            package_name, local_path)
            logging.info("Download {} to {} successfully".format(
                package_name, local_path))
            package_names.append(package_name)
        return package_names

    def download_compile_packages(self, build_name: str, local_path: str,
                                  package_types: List[PackageType]):
        self.wait_compile_paths_ready(package_types, build_name)
        package_names = []
        for package_type in package_types:
            package_name = self.find_package(
                HisiHdfs.prebuild_month_path(package_type.get_build_scene()),
                build_name, package_type)
            if package_name is None:
                with shell_printer.DotPrinter("Wait package {} from {}".format(
                        package_type.name, build_name)):
                    while package_name is None:
                        logging.debug(
                            "Can not find package {} from {}, sleep".format(
                                package_type.name, build_name))
                        time.sleep(10)
                        package_name = self.find_package(
                            HisiHdfs.prebuild_month_path(
                                package_type.get_build_scene()), build_name,
                            package_type)
                    # In practice, downloading a file right after it is created can fail or come back incomplete, so wait 5 seconds before downloading
                    time.sleep(5)

            with shell_printer.DotPrinter("Begin to download {} to {}".format(
                    package_name, local_path)):
                self.download_compile_package(package_type.get_build_scene(),
                                              build_name, package_name,
                                              local_path)
            logging.info("Download {} to {} successfully".format(
                package_name, local_path))
            package_names.append(package_name)
        return package_names

    def wait_compile_paths_ready(self, package_types: List[PackageType],
                                 build_name: str):
        scenes = set([pt.get_build_scene() for pt in package_types])
        for build_scene in scenes:
            build_path = HisiHdfs.prebuild_month_path(build_scene)
            if not self.path_exists(build_path, build_name):
                with shell_printer.DotPrinter(
                        "The build({}) path({}) has not been created, wait".
                        format(build_name, build_path)):
                    while not self.path_exists(build_path, build_name):
                        time.sleep(1)
Example #18
class HdfsFile:
    '''HDFS File Object

    Keyword arguments:
        path -- HDFS file path
        mode -- one of ['r', 'rb', 'w', 'wb', 'a', 'ab'];
                any other mode raises UnsupportedMode
                (default 'r')
        encoding -- should be specified if not in binary mode
                    (default 'utf-8')
    '''
    def __init__(self,
                 path: str,
                 mode: str = 'r',
                 encoding: str = 'utf-8',
                 host: str = HDFS_HOST,
                 port: int = HDFS_PORT,
                 user: str = HDFS_USER):
        self.client = InsecureClient(url=f'http://{host}:{port}', user=user)

        self.path = path
        self.name = path.split('/')[-1]
        self.mode = mode
        self.encoding = encoding

        if self.mode[0] == 'r':
            self.__cache_content()
            self.fptr = 0
        elif self.mode[0] == 'w':
            self.content = self.__binary_helper('')
            self.fptr = 0
        elif self.mode[0] == 'a':
            self.__cache_content()
            self.fptr = len(self.content)
        else:
            raise UnsupportedMode(f'unsupported mode {self.mode}')

    def __binary_helper(self, content):
        if len(self.mode) > 1 and self.mode[1] == 'b':
            if isinstance(content, str):
                return content.encode(self.encoding)
        else:
            if isinstance(content, bytes):
                return content.decode(self.encoding)
        return content

    def __cache_content(self):
        if not self.exists():
            raise FileNotFound()

        with self.client.read(self.path) as reader:
            self.content = self.__binary_helper(reader.read())

    # iterable compatible

    def __iter__(self):
        return self

    # iterator compatible
    def __next__(self):
        buffer = self.readline()
        if buffer == self.__binary_helper(''):
            raise StopIteration()
        return buffer

    # for with ... as ... use
    def __enter__(self):
        return self

    # for with ... as ... use
    def __exit__(self, type, value, traceback):
        self.flush()

    def exists(self) -> bool:
        if self.client.status(hdfs_path=self.path, strict=False) is None:
            return False
        return True

    def read(self, size: int = None) -> str or bytes:
        if self.mode[0] != 'r':
            raise UnsupportedOperation(f'{self.mode} does not support read')

        if size is None or size < 0:
            offset = len(self.content) - self.fptr
        else:
            offset = size
        buffer = self.content[self.fptr:self.fptr + offset]
        self.fptr += offset
        return buffer

    def readline(self, size: int = None) -> str or bytes:
        if self.mode[0] != 'r':
            raise UnsupportedOperation(f'{self.mode} does not support read')

        offset = 0
        while self.fptr + offset < len(
                self.content) and self.content[self.fptr +
                                               offset] not in [10, '\n']:
            offset += 1
        offset += 1
        buffer = self.content[self.fptr:self.fptr + offset]
        self.fptr += offset
        return buffer

    def seek(self, cookie: int):
        if not isinstance(cookie, int) or cookie < 0:
            raise InvalidParameterValue(
                f'cookie must be a non-negative integer')

        self.fptr = cookie

    def write(self, text: str or bytes) -> int:
        if self.mode[0] not in ['w', 'a']:
            raise UnsupportedOperation(f'{self.mode} does not support write')

        self.content += self.__binary_helper(text)
        return len(text)

    def flush(self):
        if self.mode[0] in ['w', 'a']:
            if not self.exists():
                self.client.write(hdfs_path=self.path, data=self.content)
            else:
                self.client.write(hdfs_path=self.path,
                                  data=self.content,
                                  overwrite=True)

    def close(self):
        self.flush()
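A short usage example for HdfsFile, assuming HDFS_HOST, HDFS_PORT and HDFS_USER are configured and that /data/notes.txt is a hypothetical, already existing file:

# Append a line, then read the file back line by line.
with HdfsFile('/data/notes.txt', mode='a', encoding='utf-8') as f:
    f.write('appended line\n')

with HdfsFile('/data/notes.txt', mode='r', encoding='utf-8') as f:
    for line in f:
        print(line.rstrip('\n'))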
Example #19
class HadoopFileSystem(object):
    def __init__(self, *opts):
        self.client = InsecureClient(current_app.config['WEBHDFS_ADDR'], user=current_app.config['WEBHDFS_USER'])
         
#     def make_tree(self, datasourceid, client, path):
#         tree = dict(name=(os.path.basename(path), datasourceid + os.path.sep + path), children=[])
#         try: lst = client.list(path, status=True)
#         except:
#             pass #ignore errors
#         else:
#             for fsitem in lst:
#                 fn = os.path.join(path, fsitem[0])
#                 if fsitem[1]['type'] == "DIRECTORY":
#                     tree['children'].append(make_hdfs_tree(datasourceid, client, fn))
#                 else:
#                     tree['children'].append({'name' : (fsitem[0], datasourceid + os.path.sep + fn), 'children' : []})
#         return tree

    def make_json(self, datasourceid, base, relative_path):
        path = os.path.join(base, relative_path)
        data_json = {'datasource': datasourceid, 'path': relative_path, 'name': os.path.basename(relative_path) }
        status = self.client.status(path, False)

        if status is not None:
            if status['type'] == "DIRECTORY":
                data_json['type'] = DataType.Folder
                data_json['children'] = [self.make_json(datasourceid, base, os.path.join(relative_path, fn)) for fn in self.client.list(path)]
            else:
                data_json['type'] = DataType.File
        #print(json.dumps(data_json))
        return data_json
    
    def makedirs(self, path):
        try: 
            self.client.makedirs(path)
        except:
            return None
        return path
    
    def delete(self, path):
        try: 
            if self.client.status(path, False) is not None:
                self.client.delete(path, True)
        except Exception as e: print(e)
        
    def addfolder(self, path):
        # Find the first "New Folder (i)" that does not already exist, then create it
        i = 0
        while self.client.status(os.path.join(path, "New Folder ({0})".format(i)), False) is not None:
            i += 1
        return self.makedirs(os.path.join(path, "New Folder ({0})".format(i)))
    
    def rename(self, oldpath, newpath):
        try:
            self.client.rename(oldpath, newpath)
        except Exception as e: print(e)
    
    def saveUpload(self, file, fullpath):
        localpath = os.path.join(tempfile.gettempdir(), os.path.basename(fullpath))
        if os.path.isfile(localpath):
            os.remove(localpath)
        try:
            file.save(localpath)
            self.client.upload(os.path.dirname(fullpath), localpath, True)
        except:
            pass
        
    def download(self, fullpath):
        status = self.client.status(fullpath, False)
        if status is not None and status['type'] == "FILE":
            localpath = os.path.join(tempfile.gettempdir(), os.path.basename(fullpath))
            return self.client.download(fullpath, localpath, True)
        else:
            return None
Example #20
class HDFSStorage(Storage):
    """
    HDFS storage
    """

    def fix_slashes(self, path):
        sep = os.path.sep
        if path[0] != sep:
            path = sep + path
        if path[-1] != sep:
            path = path + sep
        return path

    def __init__(self, location=None, base_url=None):
        self.hdfs_hosts = settings.HDFS_STORAGE['hosts']
        self.hdfs_root = self.fix_slashes(settings.HDFS_STORAGE['root'])
        self.media_root = settings.MEDIA_ROOT
        self.media_url = self.fix_slashes(settings.MEDIA_URL)

        self.fetch_url = '%s/webhdfs/v1%s%%s?op=OPEN' % (self.hdfs_hosts.split(',')[0], self.hdfs_root)
        self.client = InsecureClient(self.hdfs_hosts)

    def _open(self, name, mode='rb'):
        local_path = os.path.join(settings.MEDIA_ROOT, name.replace('/', os.path.sep))
        if not os.path.exists(local_path):
            remote_path = self.path(name)
            local_dir = os.path.dirname(local_path)
            if not os.path.exists(local_dir):
                os.mkdir(local_dir)
            print self.client.download(remote_path, local_path=local_path, overwrite=True,
                                       temp_dir=tempfile.gettempdir())
        return File(open(local_path, mode))

    def _save(self, name, content):
        print "_save(%s, %s, %s)" % (self, name, content)
        local_path = content.name
        hdfs_path = self.path(name)  # os.path.basename(local_path))
        print hdfs_path, local_path
        self.client.write(hdfs_path, data=content, overwrite=True)
        return name

    def url(self, name):
        return self.fetch_url % name

    def delete(self, name):
        return self.client.delete(self.path(name))

    def listdir(self, path):
        file_list = []
        dir_list = []
        for name, status in self.client.list(self.path(path), status=True):
            if status['type'] == 'DIRECTORY':
                dir_list.append(name)
            elif status['type'] == 'FILE':
                file_list.append(name)
        return dir_list, file_list

    def size(self, name):
        return self.client.status(self.path(name))['length']

    def exists(self, name):
        try:
            return True if self.client.status(self.path(name)) else False
        except HdfsError:
            return False

    def path(self, name):
        return (self.hdfs_root + name).replace('\\', '/')
Example #21
class HadoopFileSystem():
    def __init__(self, url, user):
        u = urlsplit(url)
        if u.scheme != 'http' and u.scheme != 'https':
            raise ValueError("Invalid name node address")

        self.url = urlunparse((u.scheme, u.netloc, '', '', '', ''))
        self.client = InsecureClient(self.url, user=user)
        self.localdir = u.path
        self.prefix = 'HDFS'

    def normalize_path(self, path):
        path = os.path.normpath(path)
        path = self.strip_prefix(path)
        while path and path[0] == os.sep:
            path = path[1:]
        return os.path.join(self.localdir, path)

    def strip_prefix(self, path):
        return path[len(self.prefix):] if path.startswith(
            self.prefix) else path

    def strip_root(self, path):
        path = self.strip_prefix(path)
        if path.startswith(self.url):
            path = path[len(self.url):]
            if not path.startswith(self.localdir):
                raise ValueError(
                    'Invalid hdfs path. It must start with the root directory')
        return path[len(self.localdir):] if path.startswith(
            self.localdir) else path

    def create_folder(self, path):
        try:
            path = self.normalize_path(path)
            self.client.makedirs(path)
        except Exception:
            return None
        return path

    def remove(self, path):
        try:
            path = self.normalize_path(path)
            if self.client.status(path, False) is not None:
                self.client.delete(path, True)
        except Exception as e:
            print(e)

    def rename(self, oldpath, newpath):
        try:
            oldpath = self.normalize_path(oldpath)
            newpath = self.normalize_path(newpath)
            self.client.rename(oldpath, newpath)
        except Exception as e:
            print(e)

    def get_files(self, path):
        path = self.normalize_path(path)
        files = []
        for f in self.client.list(path):
            status = self.client.status(join(path, f), False)
            if status['type'] != "DIRECTORY":
                files.append(f)
        return files

    def get_folders(self, path):
        path = self.normalize_path(path)
        folders = []
        for f in self.client.list(path):
            status = self.client.status(join(path, f), False)
            if status['type'] == "DIRECTORY":
                folders.append(f)
        return folders

    def exists(self, path):
        path = self.normalize_path(path)
        status = self.client.status(path, False)
        return not (status is None)

    def isdir(self, path):
        path = self.normalize_path(path)
        status = self.client.status(path, False)
        return status['type'] == "DIRECTORY"

    def isfile(self, path):
        path = self.normalize_path(path)
        status = self.client.status(path, False)
        return status['type'] == "FILE"

    def read(self, path):
        path = self.normalize_path(path)
        with self.client.read(path) as reader:
            return reader.read().decode('utf-8')

    def write(self, path, content):
        path = self.normalize_path(path)
        self.client.write(path, content)

    def make_json(self, path):
        normalized_path = self.normalize_path(path)
        data_json = {
            'path': urljoin(self.url, normalized_path),
            'text': os.path.basename(path)
        }
        status = self.client.status(normalized_path, False)

        if status is not None:
            data_json['folder'] = status['type'] == "DIRECTORY"
            if status['type'] == "DIRECTORY":
                data_json['nodes'] = [
                    self.make_json(os.path.join(path, fn))
                    for fn in self.client.list(normalized_path)
                ]
        #print(json.dumps(data_json))
        return data_json

    def save_upload(self, file, fullpath):
        localpath = os.path.join(tempfile.gettempdir(),
                                 os.path.basename(fullpath))
        if os.path.isfile(localpath):
            os.remove(localpath)
        try:
            file.save(localpath)
            if isfile(fullpath):
                fullpath = os.path.dirname(fullpath)
            self.client.upload(self.normalize_path(fullpath), localpath, True)
        except Exception:
            pass

    def download(self, path):
        path = self.normalize_path(path)
        status = self.client.status(path, False)
        if status is not None and status['type'] == "FILE":
            localpath = os.path.join(tempfile.gettempdir(),
                                     os.path.basename(path))
            return self.client.download(path, localpath, True)
        else:
            return None
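A short usage sketch for HadoopFileSystem; the namenode address, user and paths below are made-up assumptions for illustration:

fs = HadoopFileSystem('http://namenode:9870/user/demo', 'hadoop')  # base URL plus HDFS root dir
fs.create_folder('reports')                      # resolves to /user/demo/reports
fs.write('reports/hello.txt', 'hello webhdfs')   # plain client.write() with default options
print(fs.get_files('reports'))                   # e.g. ['hello.txt']
print(fs.read('reports/hello.txt'))              # 'hello webhdfs'
print(fs.make_json('reports'))                   # nested dict describing the subtree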
Example #22
0
def start_hdfs_streaming():
    while True:

        #---------------------- connection to MongoDB client ----------------------------
        client = MongoClient()
        db = client['DB_filtereddata']
        collection_edits = db['Edit_filtered_collection']
        collection_others = db['other_filtered_collection']

        #-------------------- loading data into pandas dataframe ------------------------
        col_edits = collection_edits.find()
        df_edits = pd.DataFrame(col_edits)

        col_others = collection_others.find()
        df_others = pd.DataFrame(col_others)

        # DataFrame.append was removed in pandas 2.x; concat is the equivalent
        df = pd.concat([df_edits, df_others], ignore_index=True)

        if not df.empty:
            df = df.drop_duplicates()
            print(df.count())
            print(df.head())

            #---------------------- connection to HDFS localhost ----------------------------
            client_hdfs = InsecureClient('http://localhost:50070/',
                                         user="******")
            print(client_hdfs)
            filename = 'Filtereddata.csv'

            #------------------- writing data into HDFS from dataframe ----------------------
            exist = client_hdfs.status(filename, strict=False)
            print(f'{filename} existing in: {exist}')
            try:
                if exist is None:
                    with client_hdfs.write(filename,
                                           encoding='utf-8',
                                           overwrite=True) as writer:
                        print(writer)
                        df.to_csv(writer)
                        print(f'Data saved in {filename} in {client_hdfs}')
                else:
                    with client_hdfs.write(filename,
                                           encoding='utf-8',
                                           append=True) as writer:
                        print(writer)
                        df.to_csv(writer)
                        print(
                            f'Data appended to existing file {filename} in {client_hdfs}'
                        )
            except ValueError:
                pass

            #------------------- creating backup file into local storage ----------------------
            records = df_edits.to_dict(orient='records')
            now = datetime.now()
            current_time = now.strftime("%H_%M")
            jsonpath = collection_edits.name + '_' + current_time + ".json"
            jsonpath = join("C:/Backup_Mongo/filtereddata", jsonpath)
            with open(jsonpath, 'w') as jsonfile:
                jsonfile.write(dumps(records))
            print(f'Backup stored in {jsonpath}')

            records = df_others.to_dict(orient='records')
            jsonpath = collection_others.name + '_' + current_time + ".json"
            jsonpath = join("C:/Backup_Mongo/filtereddata", jsonpath)
            with open(jsonpath, 'w') as jsonfile:
                jsonfile.write(dumps(records))
            print(f'Backup stored in {jsonpath}')

            #---------------- deleting HDFS stored records from the MongoDB ------------------
            records = df_edits.to_dict(orient='records')
            record_ids = [record['_id'] for record in records]
            collection_edits.delete_many({'_id': {'$in': record_ids}})
            print('stored edit records in HDFS and deleted from MongoDB')

            records = df_others.to_dict(orient='records')
            record_ids = [record['_id'] for record in records]
            collection_others.delete_many({'_id': {'$in': record_ids}})
            print('stored other records in HDFS and deleted from MongoDB')

        else:
            print(f"no records found in the {collection_edits}")

        time.sleep(60)
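Because the function above blocks forever in its while True loop, one plausible way to run it alongside other work (an assumption, not shown in the original) is on a daemon thread:

import threading

worker = threading.Thread(target=start_hdfs_streaming, daemon=True)
worker.start()
# the main thread stays free; the daemon thread exits with the process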
Example #23
0
class HDFS(BaseRepository):
    def __init__(self, host: str, port, user: str):
        super().__init__()
        self.host = host
        self.port = port
        self.user = user
        self.producer = None

    def connect(self):
        self.conn = InsecureClient(f"http://{self.host}:{self.port}",
                                   user=self.user)
        if os.environ.get("KAFKA_BOOTSTRAP", None):
            self.producer = KafkaProducer(bootstrap_servers=os.environ.get(
                "KAFKA_BOOTSTRAP", "localhost:1234"))
        else:
            self.producer = None

    def disconnect(self):
        self.save_snapshot()
        if self.producer:
            self.producer.close()

    def insert_rows(self, rows: list[tuple[datetime, str, str, str, str, str]]):
        self.add_buff(rows)
        self.flush()

    def _last_datetime(self, category, date):
        if self.conn.status(f"/krwordcloud/add-article/{date}")['length'] == 0:
            return config.min_date
        tfname = ''
        with tempfile.NamedTemporaryFile("wb") as tf:
            tfname = tf.name
            with self.conn.read(f"/krwordcloud/add-article/{date}",
                                chunk_size=8096) as hf:
                for chunk in hf:
                    tf.write(chunk)
            with open(tfname, 'rb') as tf:
                reader = pyorc.Reader(tf)
                maximum = datetime.datetime \
                    .strptime(f"{date} GMT+0900", "%Y-%m-%d.orc GMT%z")
                for row in reader:
                    if row[0] > maximum and row[1] == category:
                        maximum = row[0]
                if (maximum < config.min_date):
                    return config.min_date
                elif maximum > datetime.datetime.now().replace(tzinfo=KST):
                    return datetime.datetime.now().replace(tzinfo=KST)
                else:
                    return maximum

    def make_entries(self):
        entries = dict()
        hdfs_entries = dict()
        lookup_hdfs = []

        self.load_snapshot()

        for category in config.categories:
            category_rows = list(
                filter(lambda row: row[1] == category, self.buff))
            if len(category_rows) > 0:
                last = max(category_rows, key=lambda row: row[0])
                entries[category] = last[0]
            else:
                lookup_hdfs.append(category)

        try:
            dates = self.conn.list("/krwordcloud/add-article/")
            if len(dates) > 0:
                for category in lookup_hdfs:
                    found = False
                    for last in reversed(dates):
                        try:
                            entries[category] = self._last_datetime(
                                category, last)
                            found = True
                            break
                        except Exception as e:
                            print(e)
                            continue
                    if found is False:
                        entries[category] = config.min_date
            else:
                hdfs_entries = dict.fromkeys(lookup_hdfs, config.min_date)
        except HdfsError:
            entries[category] = config.min_date
        except Exception as e:
            print(e)
        return {
            k: v
            for k, v in sorted({
                **entries,
                **hdfs_entries
            }.items(),
                               key=lambda item: item[1])
        }

    def save_snapshot(self):
        print('save_snapshot')
        with self.conn.write("/krwordcloud/snapshot.json",
                             overwrite=True,
                             encoding="utf-8") as f:
            data = list(
                map(lambda x: (x[0].isoformat(), x[1], x[2], x[3], x[4], x[5]),
                    self.buff))
            json.dump(data, f, ensure_ascii=False)

    def load_snapshot(self):
        print('load_snapshot')
        try:
            with self.conn.read("/krwordcloud/snapshot.json",
                                encoding="utf-8") as f:
                self.buff = list(
                    map(
                        lambda x:
                        (parser.parse(x[0]), x[1], x[2], x[3], x[4], x[5]),
                        json.load(f)))
        except Exception:
            self.buff = []

    def flush(self):
        dates = sorted(list(set(map(lambda row: row[0].date(), self.buff))))
        if len(dates) > 1:
            for d in dates[:-1]:
                data = list(filter(lambda row: row[0].date() == d, self.buff))
                if self.producer:
                    self._kafka_flush(d, data)
                else:
                    self._hdfs_flush(d, data)
            self.buff = list(
                filter(lambda row: row[0].date() == dates[-1], self.buff))
            self.save_snapshot()

    def _kafka_flush(self, date, data):
        self.producer.send(f"add-article-{date}", data)

    def _hdfs_flush(self, date, data):
        with self.conn.write(f"/krwordcloud/add-article/{date}.orc",
                             overwrite=True) as hf:
            tfname = ''
            with tempfile.NamedTemporaryFile(mode="wb+", delete=False) as tf:
                tfname = tf.name
                with pyorc.Writer(
                        tf,
                        schema="struct<field0:timestamp,field1:string," +
                        "field2:string,field3:string>",
                ) as of:
                    of.writerows(data)
            with open(tfname, 'rb') as tf:
                for line in tf:
                    hf.write(line)
            os.unlink(tfname)
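A rough usage sketch for the repository above, assuming BaseRepository supplies add_buff() and the in-memory buff list, and that the config module defines categories and min_date; the connection details and row fields below are invented for illustration:

repo = HDFS('namenode', 9870, 'hadoop')           # hypothetical host/port/user
repo.connect()                                    # builds InsecureClient (and KafkaProducer if KAFKA_BOOTSTRAP is set)
row = (datetime.datetime.now(), 'politics',       # 6-tuple: (timestamp, category, ...)
       'title', 'body text', 'press', 'http://example.com/article')
repo.insert_rows([row])                           # buffers the row and flushes completed days
repo.disconnect()                                 # persists the remaining buffer to snapshot.json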
Example #24
0
class HDFSWrapper(object):
    def __init__(self):
        self.__m_HDFS_Handler__ = None
        self.__m_HDFS_WebFSDir__ = None
        self.__m_HDFS_User__ = None
        self.__m_HDFS_WebFSURL__ = None

    def HDFS_makedirs(self, hdfs_path):
        """ 创建目录 """
        if self.__m_HDFS_Handler__ is None:
            raise HDFSWrapperException(
                "HDFS not connected. Please connect it frist.")
        self.__m_HDFS_Handler__.makedirs(
            os.path.join(self.__m_HDFS_WebFSDir__,
                         hdfs_path).replace('\\', '/'))

    def HDFS_setPermission(self, hdfs_path, permission):
        """ 修改指定文件的权限信息 """
        if self.__m_HDFS_Handler__ is None:
            raise HDFSWrapperException(
                "HDFS not connected. Please connect it frist.")
        m_hdfs_filepath = os.path.dirname(hdfs_path)
        m_hdfs_filename = os.path.basename(hdfs_path)
        self.__m_HDFS_Handler__.set_permission(os.path.join(
            self.__m_HDFS_WebFSDir__, m_hdfs_filepath,
            m_hdfs_filename).replace('\\', '/'),
                                               permission=permission)

    def HDFS_Connect(self, p_szURL, p_szUser):
        """ 连接HDFS, URL使用WEBFS协议 """
        m_HDFS_Protocal = p_szURL.split("://")[0]
        m_HDFS_NodePort = p_szURL[len(m_HDFS_Protocal) + 3:].split("/")[0]
        m_HDFS_WebFSURL = m_HDFS_Protocal + "://" + m_HDFS_NodePort
        self.__m_HDFS_User__ = p_szUser
        self.__m_HDFS_WebFSURL__ = m_HDFS_WebFSURL
        self.__m_HDFS_WebFSDir__ = p_szURL[len(m_HDFS_WebFSURL):]
        self.__m_HDFS_Handler__ = InsecureClient(url=m_HDFS_WebFSURL,
                                                 user=p_szUser,
                                                 root=self.__m_HDFS_WebFSDir__)
        # Try to create the directory in case it does not exist yet
        self.__m_HDFS_Handler__.makedirs(
            self.__m_HDFS_WebFSDir__.replace('\\', '/'))

    def HDFS_CD(self, p_szPath):
        self.__m_HDFS_WebFSDir__ = os.path.join(self.__m_HDFS_WebFSDir__,
                                                p_szPath)
        self.__m_HDFS_Handler__ = InsecureClient(url=self.__m_HDFS_WebFSURL__,
                                                 user=self.__m_HDFS_User__,
                                                 root=self.__m_HDFS_WebFSDir__)
        # Try to create the directory in case it does not exist yet
        self.__m_HDFS_Handler__.makedirs(
            self.__m_HDFS_WebFSDir__.replace('\\', '/'))

    def HDFS_status(self, hdfs_path=""):
        """ 返回目录下的文件 """
        if self.__m_HDFS_Handler__ is None:
            raise HDFSWrapperException(
                "HDFS not connected. Please connect it frist.")

        m_ReturnList = []
        m_Status = self.__m_HDFS_Handler__.status(hdfs_path)
        m_ReturnList.append((hdfs_path, m_Status))
        return m_ReturnList

    def HDFS_list(self, hdfs_path="", recusive=False):
        """ 返回目录下的文件 """
        if self.__m_HDFS_Handler__ is None:
            raise HDFSWrapperException(
                "HDFS not connected. Please connect it frist.")

        m_ReturnList = []
        if not recusive:
            for row in self.__m_HDFS_Handler__.list(hdfs_path, status=True):
                m_ReturnList.append((os.path.join(hdfs_path, row[0]), row[1]))
            return m_ReturnList
        else:
            for row in self.__m_HDFS_Handler__.list(hdfs_path, status=True):
                if row[1]['type'].upper() == 'DIRECTORY':
                    m_ReturnList.append(
                        (os.path.join(hdfs_path, row[0]).replace("\\",
                                                                 "/"), row[1]))
                    m_ReturnList.extend(
                        self.HDFS_list(os.path.join(hdfs_path,
                                                    row[0]).replace("\\", "/"),
                                       recusive=True))
                else:
                    m_ReturnList.append(
                        (os.path.join(hdfs_path, row[0]).replace("\\",
                                                                 "/"), row[1]))
            return m_ReturnList

    def HDFS_Download(self, hdfs_path="", local_path="", recusive=False):
        """ 从hdfs获取文件到本地 """
        if self.__m_HDFS_Handler__ is None:
            raise HDFSWrapperException(
                "HDFS not connected. Please connect it frist.")

        # If local_path refers to a directory that does not exist locally, create it
        m_LocalPath = local_path
        if m_LocalPath.endswith("/") and not os.path.exists(m_LocalPath):
            os.makedirs(m_LocalPath)

        m_FileList = self.HDFS_list(recusive=recusive)
        for row in m_FileList:
            if fnmatch.fnmatch(row[0], hdfs_path):
                self.__m_HDFS_Handler__.download(row[0],
                                                 m_LocalPath,
                                                 overwrite=True)

    def HDFS_Upload(self, local_path, hdfs_path=""):
        """ 上传文件到hdfs """
        if self.__m_HDFS_Handler__ is None:
            raise HDFSWrapperException(
                "HDFS not connected. Please connect it frist.")

        for file in glob(local_path):
            if hdfs_path == "":
                m_hdfs_filepath = ""
                m_hdfs_filename = os.path.basename(file)
            else:
                if hdfs_path.endswith("/"):
                    m_hdfs_filepath = hdfs_path
                    m_hdfs_filename = os.path.basename(file)
                else:
                    m_hdfs_filepath = os.path.dirname(hdfs_path)
                    m_hdfs_filename = os.path.basename(hdfs_path)
            try:
                remote_status = self.__m_HDFS_Handler__.status(
                    hdfs_path=os.path.join(self.__m_HDFS_WebFSDir__,
                                           m_hdfs_filepath).replace('\\', '/'),
                    strict=True)
                if remote_status['type'] == "FILE":
                    # The remote path that should be a directory is actually a file; delete it
                    self.__m_HDFS_Handler__.delete(os.path.join(
                        self.__m_HDFS_WebFSDir__,
                        m_hdfs_filepath).replace('\\', '/'),
                                                   recursive=True)
                remote_status = self.__m_HDFS_Handler__.status(
                    os.path.join(self.__m_HDFS_WebFSDir__, m_hdfs_filepath,
                                 m_hdfs_filename).replace('\\', '/'))
                if remote_status['type'] == "DIRECTORY":
                    # The remote directory already exists; try to delete it
                    self.__m_HDFS_Handler__.delete(os.path.join(
                        self.__m_HDFS_WebFSDir__, m_hdfs_filepath,
                        m_hdfs_filename).replace('\\', '/'),
                                                   recursive=True)
            except HdfsError:
                # The remote directory does not exist; the subsequent upload will create it
                pass
            self.__m_HDFS_Handler__.upload(os.path.join(
                self.__m_HDFS_WebFSDir__, m_hdfs_filepath,
                m_hdfs_filename).replace('\\', '/'),
                                           file,
                                           overwrite=True,
                                           cleanup=True)

    def Process_SQLCommand(self, p_szSQL):
        try:
            m_szSQL = p_szSQL.strip()
            matchObj = re.match(r"hdfs\s+connect\s+(.*)\s+with\s+user\s+(.*)$",
                                m_szSQL, re.IGNORECASE | re.DOTALL)
            if matchObj:
                m_HDFSServer = str(matchObj.group(1)).strip()
                m_HDFSUser = str(matchObj.group(2)).strip()
                self.HDFS_Connect(m_HDFSServer, m_HDFSUser)
                return None, None, None, None, "Hdfs Server set successful."

            matchObj = re.match(r"hdfs\s+cd\s+(.*)$", m_szSQL,
                                re.IGNORECASE | re.DOTALL)
            if matchObj:
                m_HDFSPath = str(matchObj.group(1)).strip()
                self.HDFS_CD(m_HDFSPath)
                return None, None, None, None, "Hdfs root dir change successful."

            matchObj = re.match(r"hdfs\s+status\s+(.*)$", m_szSQL,
                                re.IGNORECASE | re.DOTALL)
            if matchObj:
                m_TargetFileList = str(matchObj.group(1)).strip()
                m_ReturnFileList = self.HDFS_status(m_TargetFileList)
                m_Result = []
                for (m_FileName, m_FileProperties) in m_ReturnFileList:
                    if m_FileProperties["type"] == "FILE":
                        m_PermissionMask = "-"
                    elif m_FileProperties["type"] == "DIRECTORY":
                        m_PermissionMask = "d"
                    else:
                        m_PermissionMask = "?"
                    if len(m_FileProperties["permission"]) == 3:
                        for m_nPos in range(0, 3):
                            if m_FileProperties["permission"][m_nPos] == "0":
                                m_PermissionMask = m_PermissionMask + "---"
                            elif m_FileProperties["permission"][m_nPos] == "1":
                                m_PermissionMask = m_PermissionMask + "--x"
                            elif m_FileProperties["permission"][m_nPos] == "2":
                                m_PermissionMask = m_PermissionMask + "-w-"
                            elif m_FileProperties["permission"][m_nPos] == "3":
                                m_PermissionMask = m_PermissionMask + "-wx"
                            elif m_FileProperties["permission"][m_nPos] == "4":
                                m_PermissionMask = m_PermissionMask + "r--"
                            elif m_FileProperties["permission"][m_nPos] == "5":
                                m_PermissionMask = m_PermissionMask + "r-x"
                            elif m_FileProperties["permission"][m_nPos] == "6":
                                m_PermissionMask = m_PermissionMask + "rw-"
                            elif m_FileProperties["permission"][m_nPos] == "7":
                                m_PermissionMask = m_PermissionMask + "rwx"
                            else:
                                m_PermissionMask = m_PermissionMask + "???"
                    else:
                        m_PermissionMask = m_PermissionMask + "?????????"
                    m_ModifiedTime = str(
                        datetime.datetime.utcfromtimestamp(
                            m_FileProperties["modificationTime"] /
                            1000).strftime("%Y-%m-%d %H:%M:%S"))
                    m_Result.append([
                        m_TargetFileList, m_PermissionMask,
                        m_FileProperties["owner"], m_FileProperties["group"],
                        m_FileProperties["length"], m_ModifiedTime
                    ])
                return "HDFS file status:", m_Result, ["Path", "Permission", "owner", "group", "Size", "Modified"], \
                       None, "Total " + str(len(m_Result)) + " files listed."

            matchObj = re.match(r"hdfs\s+rm\s+(.*)$", m_szSQL,
                                re.IGNORECASE | re.DOTALL)
            if matchObj:
                m_Bak_WebFSDir = self.__m_HDFS_WebFSDir__
                m_FileDeleted = str(matchObj.group(1)).strip()
                m_FileDeletedPath = os.path.dirname(m_FileDeleted)
                m_FileDeletedName = os.path.basename(m_FileDeleted)
                self.HDFS_CD(m_FileDeletedPath)
                m_FileList = self.HDFS_list(self.__m_HDFS_WebFSDir__,
                                            recusive=False)
                for row in m_FileList:
                    if fnmatch.fnmatch(os.path.basename(row[0]),
                                       m_FileDeletedName):
                        self.__m_HDFS_Handler__.delete(row[0],
                                                       recursive=True)
                # Change back to the original directory
                self.HDFS_CD(m_Bak_WebFSDir)
                return None, None, None, None, "Hdfs file deleted successfully."

            matchObj = re.match(r"hdfs\s+makedirs\s+(.*)$", m_szSQL,
                                re.IGNORECASE | re.DOTALL)
            if matchObj:
                m_Dir = str(matchObj.group(1)).strip()
                self.HDFS_makedirs(m_Dir)
                return None, None, None, None, "Hdfs directory created successful."

            matchObj = re.match(r"hdfs\s+set_permission\s+(.*)\s+(.*)$",
                                m_szSQL, re.IGNORECASE | re.DOTALL)
            if matchObj:
                m_File = str(matchObj.group(1)).strip()
                m_FilePermission = str(matchObj.group(2)).strip()
                self.HDFS_setPermission(m_File, m_FilePermission)
                return None, None, None, None, "Hdfs set permission successful."

            m_FileUpload = ""
            m_TargetDir = None
            matchObj = re.match(r"hdfs\s+upload\s+(.*)$", m_szSQL,
                                re.IGNORECASE | re.DOTALL)
            if matchObj:
                m_FileUpload = str(matchObj.group(1)).strip()
                m_TargetDir = ""
            matchObj = re.match(r"hdfs\s+upload\s+(.*)\s+(.*)$", m_szSQL,
                                re.IGNORECASE | re.DOTALL)
            if matchObj:
                m_FileUpload = str(matchObj.group(1)).strip()
                m_TargetDir = str(matchObj.group(2)).strip()
            if m_TargetDir is not None:
                self.HDFS_Upload(m_FileUpload, m_TargetDir)
                return None, None, None, None, "Hdfs file upload successful."

            m_FileDownload = ""
            m_TargetDir = None
            matchObj = re.match(r"hdfs\s+download\s+(.*)$", m_szSQL,
                                re.IGNORECASE | re.DOTALL)
            if matchObj:
                m_FileDownload = str(matchObj.group(1)).strip()
                m_TargetDir = ""
            matchObj = re.match(r"hdfs\s+download\s+(.*)\s+(.*)$", m_szSQL,
                                re.IGNORECASE | re.DOTALL)
            if matchObj:
                m_FileDownload = str(matchObj.group(1)).strip()
                m_TargetDir = str(matchObj.group(2)).strip()
            if m_TargetDir is not None:
                self.HDFS_Download(m_FileDownload, m_TargetDir)
                return None, None, None, None, "Hdfs file download successful."

            m_TargetFileList = None
            matchObj = re.match(r"hdfs\s+list(\s+)?$", m_szSQL,
                                re.IGNORECASE | re.DOTALL)
            if matchObj:
                m_TargetFileList = ""
            matchObj = re.match(r"hdfs\s+list\s+(.*)?$", m_szSQL,
                                re.IGNORECASE | re.DOTALL)
            if matchObj:
                m_TargetFileList = str(matchObj.group(1)).strip()
            if m_TargetFileList is not None:
                m_ReturnFileList = self.HDFS_list(m_TargetFileList,
                                                  recusive=True)
                m_Result = []
                for (m_FileName, m_FileProperties) in m_ReturnFileList:
                    if m_FileProperties["type"] == "FILE":
                        m_PermissionMask = "-"
                    elif m_FileProperties["type"] == "DIRECTORY":
                        m_PermissionMask = "d"
                    else:
                        m_PermissionMask = "?"
                    if len(m_FileProperties["permission"]) == 3:
                        for m_nPos in range(0, 3):
                            if m_FileProperties["permission"][m_nPos] == "0":
                                m_PermissionMask = m_PermissionMask + "---"
                            elif m_FileProperties["permission"][m_nPos] == "1":
                                m_PermissionMask = m_PermissionMask + "--x"
                            elif m_FileProperties["permission"][m_nPos] == "2":
                                m_PermissionMask = m_PermissionMask + "-w-"
                            elif m_FileProperties["permission"][m_nPos] == "3":
                                m_PermissionMask = m_PermissionMask + "-wx"
                            elif m_FileProperties["permission"][m_nPos] == "4":
                                m_PermissionMask = m_PermissionMask + "r--"
                            elif m_FileProperties["permission"][m_nPos] == "5":
                                m_PermissionMask = m_PermissionMask + "r-x"
                            elif m_FileProperties["permission"][m_nPos] == "6":
                                m_PermissionMask = m_PermissionMask + "rw-"
                            elif m_FileProperties["permission"][m_nPos] == "7":
                                m_PermissionMask = m_PermissionMask + "rwx"
                            else:
                                m_PermissionMask = m_PermissionMask + "???"
                    else:
                        m_PermissionMask = m_PermissionMask + "?????????"
                    m_ModifiedTime = str(
                        datetime.datetime.utcfromtimestamp(
                            m_FileProperties["modificationTime"] /
                            1000).strftime("%Y-%m-%d %H:%M:%S"))
                    m_Result.append([
                        m_FileProperties["pathSuffix"], m_PermissionMask,
                        m_FileProperties["owner"], m_FileProperties["group"],
                        m_FileProperties["length"], m_ModifiedTime
                    ])
                return "HDFS file List:", m_Result, ["Path", "Permission", "owner", "group", "Size", "Modified"], \
                       None, "Total " + str(len(m_Result)) + " files listed."
            return None, None, None, None, "Unknown HDFS Command."
        except (HDFSWrapperException, HdfsError) as he:
            if "SQLCLI_DEBUG" in os.environ:
                print('traceback.print_exc():\n%s' % traceback.print_exc())
                print('traceback.format_exc():\n%s' % traceback.format_exc())
            raise SQLCliException(he.message)