Example #1
import os
import shutil
from os import listdir
from os.path import isfile, join

# FileSplit comes from the legacy "filesplit" package (fsplit.filesplit); the
# "settings" module is assumed to provide HDFS_HOST_VALUE and HDFS_USER_VALUE.
from fsplit.filesplit import FileSplit
from hdfs import InsecureClient

import settings


def upload_to_hdfs(input_dir, output_dir, chunk_size):
    # chunk paths produced by FileSplit, collected via the split callback
    # (the legacy filesplit API is assumed to call back with (path, size))
    split_files = []

    def split_callback(path, size):
        split_files.append(path)

    # locate files in the input directory
    files = [
        os.path.abspath("{}/{}".format(input_dir, f))
        for f in listdir(input_dir) if isfile(join(input_dir, f))
    ]
    tmp_dir = "{}/tmp".format(input_dir)

    # set up a clean temp dir
    if os.path.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.mkdir(tmp_dir)

    # split files into chunks of `chunk_size` MB
    for f in files:
        fs = FileSplit(file=f,
                       splitsize=chunk_size * 1e6,
                       output_dir=tmp_dir)
        fs.split(callback=split_callback)

    # connect to HDFS over WebHDFS
    hdfs_client = InsecureClient("http://{}:9870".format(
        settings.HDFS_HOST_VALUE),
                                 user=settings.HDFS_USER_VALUE)

    # delete the existing output dir, if any
    if hdfs_client.content(output_dir, strict=False) is not None:
        hdfs_client.delete(output_dir, recursive=True)

    # upload the chunked files to a temporary HDFS dir
    hdfs_client.upload(hdfs_path="/tmp",
                       local_path=tmp_dir,
                       n_threads=-1,
                       overwrite=True)

    # rename the temporary HDFS dir to output_dir
    hdfs_client.rename("/tmp", output_dir)

    print(
        "{} files uploaded to hdfs host '{}{}'  ({} file chunks total)".format(
            len(files),
            settings.HDFS_HOST_VALUE,
            output_dir,
            len(split_files),
        ))
    # delete local temp files
    shutil.rmtree(tmp_dir)

    # HDFS paths of the uploaded chunks (assumes chunk basenames are kept on upload)
    hdfs_file_paths = [
        "{}/{}".format(output_dir, os.path.basename(p)) for p in split_files
    ]
    return hdfs_file_paths
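# A minimal usage sketch, assuming the settings module above and a local
# ./input directory; the paths and the 128 MB chunk size are placeholders.
chunk_paths = upload_to_hdfs(input_dir="./input",
                             output_dir="/data/output",
                             chunk_size=128)
print(chunk_paths)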
Example #2
from hdfs import InsecureClient


def renameFiles(ip='172.20.10.2',
                port='9870',
                username='******',
                MainName='result-part',
                SubName='.json',
                dirPath='/tmp/Cathay/'):
    client = InsecureClient("http://" + ip + ":" + port, user=username)
    if dirPath[-1] != '/':
        dirPath += '/'
    fns = client.list(dirPath)
    for fn in fns:
        if 'part-' in fn:
            num = str(int(fn.split('part-')[-1]) + 1)
            client.rename(dirPath + fn, dirPath + MainName + num + SubName)
    return str(fns) + "\n     Change to     \n" + str(client.list(dirPath))
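# A minimal usage sketch: with the defaults above this renames e.g.
# /tmp/Cathay/part-00000 to /tmp/Cathay/result-part1.json. The host, port,
# username and directory below are placeholders for the local cluster.
print(renameFiles(ip='172.20.10.2',
                  port='9870',
                  username='hadoop',
                  dirPath='/tmp/Cathay/'))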
Example #3
from hdfs import InsecureClient


class HDFSLibrary:
    """
        Test library for working with HDFS
    """
    WEB_HDFS_URL = ""
    client = ""

    def __init__(self, namenode="localhost", port="50070"):
        self.WEB_HDFS_URL = 'http://' + namenode + ':' + str(port)
        print(namenode, ">>", port, ">>", self.WEB_HDFS_URL)
        self.client = InsecureClient(self.WEB_HDFS_URL)

    def check_hdfs_file_exists(self, file_path, stop=False):
        if self.client.status(file_path, strict=False) is None:
            if stop:
                print("ERROR: File does not exist:", file_path)
                return "ERROR: File does not exist: " + file_path
                # exit(172)
            return False
        return True

    def get_hdfs_file_content(self, file_path):
        self.check_hdfs_file_exists(file_path, stop=True)
        data = ""
        # read as text (utf-8) so lines can be concatenated into a str
        with self.client.read(file_path, encoding='utf-8') as reader:
            for line in reader:
                data += line
        return data

    def search_string_in_hdfs_file(self,
                                   file_path,
                                   text1,
                                   text2="aqwszx",
                                   text3="xzswqa"):
        self.check_hdfs_file_exists(file_path, stop=True)
        found = ""
        # read as text (utf-8) so the substring checks work on str lines
        with self.client.read(file_path, encoding='utf-8') as reader:
            for line in reader:
                if line.find(text1) == -1 and line.find(
                        text2) == -1 and line.find(text3) == -1:
                    continue
                found += line
        return found

    def hdfs_file_should_not_contain(self,
                                     file_path,
                                     text1,
                                     text2="aqwszx",
                                     text3="xzswqa"):
        self.check_hdfs_file_exists(file_path, stop=True)
        with self.client.read(file_path, encoding='utf-8') as reader:
            for line in reader:
                if line.find(text1) != -1 or line.find(
                        text2) != -1 or line.find(text3) != -1:
                    return False
        return True

    ########################
    # # BASIC FUNCTIONS: # #
    ########################
    def get_hdfs_file_folder_content_summary(self, file_path):
        """
        Retrieving a file or folder content summary.
        :return: returns a file or folder content summary.
        """
        self.check_hdfs_file_exists(file_path, stop=True)
        return self.client.content(file_path)

    def get_hdfs_file_folder_status(self, file_path):
        """
        Retrieving a file or folder status.
        :return: returns a file or folder status.
        """
        self.check_hdfs_file_exists(file_path, stop=True)
        return self.client.status(file_path)

    def list_hdfs_directory(self, folder_path):
        """
        Listing all files inside a directory.
        :return: returns a file list.
        """
        self.check_hdfs_file_exists(folder_path, stop=True)
        return self.client.list(folder_path)

    def move_hdfs_file(self, old_path, new_path):
        """
        Renaming ("moving") a file.
        :return: NA
        """
        self.check_hdfs_file_exists(old_path, stop=True)
        self.client.rename(old_path, new_path)

    def delete_hdfs_file(self, file_path):
        """
        Deleting a file or folder recursively.
        :return: returns `True` if the deletion was successful otherwise `False`
        """
        self.check_hdfs_file_exists(file_path)
        return self.client.delete(file_path, recursive=True)

    def copy_to_local_hdfs_file(self, hdfs_path, local_path):
        """
        Copy a file or folder from HDFS to local.
        :return: local_path
        """
        self.check_hdfs_file_exists(hdfs_path)
        return self.client.download(hdfs_path,
                                    local_path,
                                    overwrite=True,
                                    n_threads=4)

    def copy_from_local_hdfs_file(self, local_path, hdfs_path):
        """
        Copy a file or folder from local to HDFS.
        :return: hdfs_path
        """
        return self.client.upload(hdfs_path,
                                  local_path,
                                  overwrite=True,
                                  n_threads=4)

    def get_hdfs_file_checksum(self, file_path):
        """
        Get the checksum value for file
        :return: checksum
        """
        self.check_hdfs_file_exists(file_path, stop=True)
        return self.client.checksum(file_path)

    def create_hdfs_dir(self, dir_path, perm=755):
        """
        Create a directory or recursive dirs on HDFS
        :return: NA
        """
        self.client.makedirs(dir_path, permission=perm)
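# A minimal usage sketch for the library above; the namenode address, port and
# file path are placeholders for the local cluster.
lib = HDFSLibrary(namenode="namenode.example.com", port="9870")
if lib.check_hdfs_file_exists("/data/sample.txt"):
    print(lib.get_hdfs_file_content("/data/sample.txt"))
    print(lib.get_hdfs_file_checksum("/data/sample.txt"))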
Example #4
import os
import tempfile

from flask import current_app
from hdfs import InsecureClient

# DataType is assumed to be a project-level enum providing Folder and File members.


class HadoopFileSystem(object):
    def __init__(self, *opts):
        self.client = InsecureClient(current_app.config['WEBHDFS_ADDR'], user=current_app.config['WEBHDFS_USER'])
         
#     def make_tree(self, datasourceid, client, path):
#         tree = dict(name=(os.path.basename(path), datasourceid + os.path.sep + path), children=[])
#         try: lst = client.list(path, status=True)
#         except:
#             pass #ignore errors
#         else:
#             for fsitem in lst:
#                 fn = os.path.join(path, fsitem[0])
#                 if fsitem[1]['type'] == "DIRECTORY":
#                     tree['children'].append(make_hdfs_tree(datasourceid, client, fn))
#                 else:
#                     tree['children'].append({'name' : (fsitem[0], datasourceid + os.path.sep + fn), 'children' : []})
#         return tree

    def make_json(self, datasourceid, base, relative_path):
        path = os.path.join(base, relative_path)
        data_json = {'datasource': datasourceid, 'path': relative_path, 'name': os.path.basename(relative_path) }
        status = self.client.status(path, False)

        if status is not None:
            if status['type'] == "DIRECTORY":
                data_json['type'] = DataType.Folder
                data_json['children'] = [self.make_json(datasourceid, base, os.path.join(relative_path, fn)) for fn in self.client.list(path)]
            else:
                data_json['type'] = DataType.File
        #print(json.dumps(data_json))
        return data_json
    
    def makedirs(self, path):
        try:
            self.client.makedirs(path)
        except Exception:
            return None
        return path

    def delete(self, path):
        try:
            if self.client.status(path, False) is not None:
                self.client.delete(path, True)
        except Exception as e: print(e)

    def addfolder(self, path):
        # find the first "New Folder (i)" name that does not exist yet
        i = 0
        while self.client.status(os.path.join(path, "New Folder ({0})".format(i)), False) is not None:
            i += 1
        return self.makedirs(os.path.join(path, "New Folder ({0})".format(i)))
    
    def rename(self, oldpath, newpath):
        try:
            self.client.rename(oldpath, newpath)
        except Exception as e: print(e)
    
    def saveUpload(self, file, fullpath):
        localpath = os.path.join(tempfile.gettempdir(), os.path.basename(fullpath))
        if os.path.isfile(localpath):
            os.remove(localpath)
        try:
            file.save(localpath)
            self.client.upload(os.path.dirname(fullpath), localpath, overwrite=True)
        except Exception:
            pass

    def download(self, fullpath):
        status = self.client.status(fullpath, False)
        if status is not None and status['type'] == "FILE":
            localpath = os.path.join(tempfile.gettempdir(), os.path.basename(fullpath))
            return self.client.download(fullpath, localpath, overwrite=True)
        else:
            return None
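# A minimal usage sketch, assuming a Flask app whose config defines the
# WEBHDFS_ADDR and WEBHDFS_USER keys read by the constructor above; the
# address and user shown are placeholders.
from flask import Flask

app = Flask(__name__)
app.config['WEBHDFS_ADDR'] = 'http://namenode:9870'
app.config['WEBHDFS_USER'] = 'hdfs'
with app.app_context():
    fs = HadoopFileSystem()
    fs.makedirs('/tmp/demo')
    print(fs.addfolder('/tmp/demo'))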
Example #5
import os
import tempfile
from os.path import join
from urllib.parse import urljoin, urlsplit, urlunparse

from hdfs import InsecureClient


class HadoopFileSystem():
    def __init__(self, url, user):
        u = urlsplit(url)
        if u.scheme != 'http' and u.scheme != 'https':
            raise ValueError("Invalid name node address")

        self.url = urlunparse((u.scheme, u.netloc, '', '', '', ''))
        self.client = InsecureClient(self.url, user=user)
        self.localdir = u.path
        self.prefix = 'HDFS'

    def normalize_path(self, path):
        path = os.path.normpath(path)
        path = self.strip_prefix(path)
        while path and path[0] == os.sep:
            path = path[1:]
        return os.path.join(self.localdir, path)

    def strip_prefix(self, path):
        return path[len(self.prefix):] if path.startswith(
            self.prefix) else path

    def strip_root(self, path):
        path = self.strip_prefix(path)
        if path.startswith(self.url):
            path = path[len(self.url):]
            if not path.startswith(self.localdir):
                raise ValueError(
                    'Invalid hdfs path. It must start with the root directory')
        return path[len(self.localdir):] if path.startswith(
            self.localdir) else path

    def create_folder(self, path):
        try:
            path = self.normalize_path(path)
            self.client.makedirs(path)
        except Exception:
            return None
        return path

    def remove(self, path):
        try:
            path = self.normalize_path(path)
            if self.client.status(path, False) is not None:
                self.client.delete(path, True)
        except Exception as e:
            print(e)

    def rename(self, oldpath, newpath):
        try:
            oldpath = self.normalize_path(oldpath)
            newpath = self.normalize_path(newpath)
            self.client.rename(oldpath, newpath)
        except Exception as e:
            print(e)

    def get_files(self, path):
        path = self.normalize_path(path)
        files = []
        for f in self.client.list(path):
            status = self.client.status(join(path, f), False)
            if status['type'] != "DIRECTORY":
                files.append(f)
        return files

    def get_folders(self, path):
        path = self.normalize_path(path)
        folders = []
        for f in self.client.list(path):
            status = self.client.status(join(path, f), False)
            if status['type'] == "DIRECTORY":
                folders.append(f)
        return folders

    def exists(self, path):
        path = self.normalize_path(path)
        status = self.client.status(path, False)
        return not (status is None)

    def isdir(self, path):
        path = self.normalize_path(path)
        status = self.client.status(path, False)
        return status is not None and status['type'] == "DIRECTORY"

    def isfile(self, path):
        path = self.normalize_path(path)
        status = self.client.status(path, False)
        return status is not None and status['type'] == "FILE"

    def read(self, path):
        path = self.normalize_path(path)
        with self.client.read(path) as reader:
            return reader.read().decode('utf-8')

    def write(self, path, content):
        path = self.normalize_path(path)
        self.client.write(path, content)

    def make_json(self, path):
        normalized_path = self.normalize_path(path)
        data_json = {
            'path': urljoin(self.url, normalized_path),
            'text': os.path.basename(path)
        }
        status = self.client.status(normalized_path, False)

        if status is not None:
            data_json['folder'] = status['type'] == "DIRECTORY"
            if status['type'] == "DIRECTORY":
                data_json['nodes'] = [
                    self.make_json(os.path.join(path, fn))
                    for fn in self.client.list(normalized_path)
                ]
        #print(json.dumps(data_json))
        return data_json

    def save_upload(self, file, fullpath):
        localpath = os.path.join(tempfile.gettempdir(),
                                 os.path.basename(fullpath))
        if os.path.isfile(localpath):
            os.remove(localpath)
        try:
            file.save(localpath)
            # if the target is an existing HDFS file, upload into its parent dir
            if self.isfile(fullpath):
                fullpath = os.path.dirname(fullpath)
            self.client.upload(self.normalize_path(fullpath), localpath,
                               overwrite=True)
        except Exception:
            pass

    def download(self, path):
        path = self.normalize_path(path)
        status = self.client.status(path, False)
        if status is not None and status['type'] == "FILE":
            localpath = os.path.join(tempfile.gettempdir(),
                                     os.path.basename(path))
            return self.client.download(path, localpath, overwrite=True)
        else:
            return None
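# A minimal usage sketch; the WebHDFS URL (whose path component becomes the
# HDFS root directory) and the user are placeholders for the local cluster.
fs = HadoopFileSystem('http://namenode:9870/user/demo', user='demo')
fs.create_folder('HDFS/reports')
fs.write('HDFS/reports/hello.txt', 'hello from webhdfs')
print(fs.get_files('HDFS/reports'))
print(fs.make_json('HDFS/reports'))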
Example #6

from hdfs import InsecureClient

# Connection details are assumptions; adjust the namenode address and user to
# the local cluster.
client = InsecureClient('http://localhost:9870', user='hadoop')

print('Begin')

# 1. Make a directory named: /activity1/
client.makedirs(hdfs_path='/activity1/', permission=None)
client.makedirs(hdfs_path='/activity1/data/', permission=None)

# 2. Put the file RandomText.txt into HDFS as the path: /activity1/data/RandomText.txt
client.upload(hdfs_path='/activity1/data/', local_path='./RandomText.txt')

# 3. List the contents of the directory /activity1/data/
print(client.list('/activity1/data'))

# 4. Move the HDFS file /activity1/data/RandomText.txt to /activity1/data/NotSoRandomText.txt
client.rename('/activity1/data/RandomText.txt',
              '/activity1/data/NotSoRandomText.txt')

# read the local file so its full contents can be appended below
with open('./RandomText.txt', 'r') as f:
    temp = f.read()

# 5. Append the local file RandomText.txt to the end of the HDFS file: /activity1/data/NotSoRandomText.txt
client.write(hdfs_path='/activity1/data/NotSoRandomText.txt',
             data=temp,
             append=True)

# 6. List the disk space used by the directory /activity1/data/
diskSpaceUsed = client.content('/activity1/data/', strict=True)
print(diskSpaceUsed['spaceConsumed'])

# 7. Put the local file MoreRandomText.txt into HDFS as the path: /activity1/data/MoreRandomText.txt
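# A likely completion for step 7, mirroring the upload call in step 2; the
# local file name is taken from the comment above.
client.upload(hdfs_path='/activity1/data/MoreRandomText.txt',
              local_path='./MoreRandomText.txt')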
Example #7
import time

import numpy as np
import pandas as pd
from cassandra.cluster import Cluster
from hdfs import InsecureClient
from hdfs.util import HdfsError

# create_database() is assumed to be a project-level helper that creates the
# keyspace and column family when needed.


def start_service():
    global isRunning
    global stopRunning
    global currentFileToImport

    stopRunning = False
    isRunning = True

    while not stopRunning:

        # Connect to the Cassandra cluster
        cluster = Cluster(contact_points=['cassandra'], port=9042)
        session = cluster.connect()
        session.default_timeout = 10

        # Create the database if it does not exist
        replicationFactor = 2
        forceReplace = False
        db_name = "pjm"
        columnFamilyName = "estimated_load_hourly"
        create_database(session, db_name, columnFamilyName, replicationFactor,
                        forceReplace)

        try:
            # Download path for the data files
            hdfs_file_path = "/user/root/data/pjm"

            # Connect to the HDFS client
            client = InsecureClient(url='http://namenode:9870', user='******')

            # Create the folder for storing processed files
            client.makedirs(hdfs_file_path + '/imported')

            # Get the list of files to process
            files = client.list(hdfs_file_path, status=True)
            #print(files)
            #print(client.parts(hdfs_file_path))

            # Process the files
            for pjm_file, filestatus in files:
                # Update the current file name for status reporting
                currentFileToImport = pjm_file
                print(pjm_file)

                if filestatus['type'] == 'FILE':
                    # Read the file into memory
                    with client.read(hdfs_file_path + "/" + pjm_file,
                                     encoding='utf-8') as reader:
                        df = pd.read_csv(reader, sep=',', header='infer')

                        # ----- Initial DataFrame transformation -----
                        # Drop unneeded columns
                        df = df.drop(columns=[
                            'datetime_beginning_ept', 'datetime_beginning_utc',
                            'datetime_ending_utc'
                        ])

                        # Drop duplicate observations, keeping the first occurrence
                        df = df.drop_duplicates(
                            subset=['datetime_ending_ept', 'load_area'],
                            keep='first')

                        # Format the observation date
                        df['datetime_measure'] = df.apply(
                            lambda x: pd.to_datetime(
                                x[['datetime_ending_ept']],
                                format='%m/%d/%Y %I:%M:%S %p'),
                            axis=1)
                        df['datetime_measure'] = df['datetime_measure'].astype(
                            'datetime64[ns]')
                        df = df.drop(columns=['datetime_ending_ept'])

                        # Import the raw data into Cassandra (table "estimated_load_hourly")

                        #for i in df.index:
                        #request_insert = "INSERT INTO " + columnFamilyName + " (datetime_ending_ept, load_area, estimated_load) " \
                        #	+ " VALUES ('" + str(df['datetime_measure'][i]) + "','" + df['load_area'][i] + "', " + str(df['estimated_load_hourly'][i]) + ");"
                        #print(request_insert)
                        #session.execute(request_insert)

                        # ----- Data cleaning ---------
                        # Goal: keep the total consumption per hour

                        # Drop the load area column
                        df = df.drop(columns=['load_area'])

                        # Compute the total consumption per hour
                        df = df.groupby(by=['datetime_measure']).sum()

                        # Add extra columns derived from the measurement date
                        df['date'] = df.index.date
                        df['annee'] = df.index.year
                        df['mois'] = df.index.month
                        df['semaine'] = df.index.isocalendar().week
                        df['heure'] = df.index.hour
                        df['jour_annee'] = df.index.dayofyear
                        df['trimestre'] = df.index.quarter
                        df['jour_semaine'] = df.index.dayofweek
                        df['jour_mois'] = df.index.day

                        # Add the very rare missing hours so the observation frequency stays consistent
                        checkmonth = pd.DataFrame(
                            df.groupby(['jour_annee',
                                        'annee'])['annee'].count())
                        checkmonth.rename(columns={'annee': 'nb'},
                                          inplace=True)
                        df_missing = checkmonth[checkmonth.nb != 24]
                        df_missing = df_missing.reset_index()

                        Hour0_24 = pd.DataFrame(np.arange(24))
                        df_to_append = pd.DataFrame()

                        for x, y in zip(df_missing['jour_annee'],
                                        df_missing['annee']):
                            print("Jour avec des heures manquantes :", x, y)

                            df_encours = df[(df.jour_annee == x)
                                            & (df.annee == y)]
                            h_missing = Hour0_24[~Hour0_24[0].isin(df_encours.
                                                                   heure)]

                            h = h_missing.iloc[0].name
                            df_to_append = df_to_append.append(
                                df[(df.jour_annee == x) & (df.annee == y) &
                                   (df.heure == (h + 1))])

                        df_to_append.heure = df_to_append.heure - 1
                        df = df.append(df_to_append)

                        # Insert the cleaned data into Cassandra (table "estimated_load_hourly_summary")
                        for label, row in df.iterrows():
                            print(label)
                            request_insert = "INSERT INTO " + columnFamilyName + "_summary " \
                             + " (datetime_est_load " \
                             + " ,date_est_load " \
                             + " ,annee " \
                             + " ,mois " \
                             + " ,semaine " \
                             + " ,heure " \
                             + " ,trimestre " \
                             + " ,jour_annee " \
                             + " ,jour_semaine " \
                             + " ,jour_mois " \
                             + " ,total_estimated_load " \
                             + " ) " \
                             + " VALUES ('" + str(label) + "' " \
                             + " ,'" + str(row['date']) + "' " \
                             + " ," + str(row['annee']) + " " \
                             + " ," + str(row['mois']) + " " \
                             + " ," + str(row['semaine']) + " " \
                             + " ," + str(row['heure']) + " " \
                             + " ," + str(row['trimestre']) + " " \
                             + " ," + str(row['jour_annee']) + " " \
                             + " ," + str(row['jour_semaine']) + " " \
                             + " ," + str(row['jour_mois']) + " " \
                             + " ," + str(row['estimated_load_hourly']) + " );"

                            print(request_insert)
                            session.execute(request_insert)

                    # Move the processed file into the "imported" folder
                    client.rename(hdfs_file_path + "/" + pjm_file,
                                  hdfs_file_path + '/imported/' + pjm_file)

        except HdfsError as ex:
            # Handle HDFS errors
            print(str(ex))

        # No more files to process
        currentFileToImport = ""

        #closing Cassandra connection
        session.shutdown()

        # Wait before checking for more files
        time.sleep(30)

    return ""
Example #8
import datetime
import logging
import os
import posixpath as psp
import time

from hdfs import InsecureClient

# HADOOPS (per-service configuration), HdfsPathParser, calculate_sha512_local,
# calculate_reader_hash and permissions_octal_to_string are assumed to come
# from the surrounding project.
logger = logging.getLogger(__name__)


class WebHDFSStore():
    '''
    A file store based on the WebHDFS protocol.
    '''
    # Set a refresh-date to indicate when we did this lookup:
    refresh_date = datetime.datetime.utcnow().isoformat(
        timespec='milliseconds') + 'Z'

    def __init__(self, service_id, user_override=None):
        self.service_id = service_id
        self.webhdfs_url = HADOOPS[service_id]['webhdfs_url']
        self.webhdfs_user = HADOOPS[service_id]['webhdfs_user']
        if user_override:
            self.webhdfs_user = user_override
        self.id_prefix = HADOOPS[service_id]['id_prefix']
        self.client = InsecureClient(self.webhdfs_url, self.webhdfs_user)

    def put(self, local_path, hdfs_path, backup_and_replace=False):
        # Get the status of the destination:
        dest_status = self.client.status(hdfs_path, strict=False)

        # Handle files or directories:
        if os.path.isfile(local_path):
            hdfs_path = self._combine_paths(dest_status, local_path, hdfs_path)
            # Return the success flag so move() can act on it:
            return self._upload_file(local_path, hdfs_path, backup_and_replace)
        elif os.path.isdir(local_path):
            # TODO, if it's a directory
            raise Exception(
                "Cannot upload anything other than single files at this time!")
        else:
            raise Exception("Unknown path type! Can't handle %s" % local_path)

    def _combine_paths(self, dest_status, local_path, hdfs_path):
        # If the hdfs_path is a directory, combine the paths:
        if dest_status and dest_status['type'] == 'DIRECTORY':
            combined_path = psp.join(hdfs_path, local_path)
            logger.info("Using combined path: %s" % combined_path)
            return combined_path
        else:
            # Otherwise, just return the path:
            return hdfs_path

    def _upload_file(self, local_path, hdfs_path, backup_and_replace=False):
        """
        Copy up to HDFS, making it suitably atomic by using a temporary filename during upload.

        :return: True if the upload succeeded, False otherwise.
        """

        # Set up flag to record outcome:
        success = False

        # Calculate hash of local file:
        logger.info("Calculating hash of %s" % local_path)
        if not os.path.isfile(local_path):
            raise Exception("Cannot upload %s - individual files only!" %
                            local_path)
        local_hash = calculate_sha512_local(local_path)
        logger.info("Local %s hash is %s " % (local_path, local_hash))

        #
        # TODO Allow upload  to overwrite truncated files?
        #

        # Check if the destination file exists:
        already_exists = self.exists(hdfs_path)
        if already_exists and not backup_and_replace:
            logger.warning(
                "Path %s already exists! No upload will be attempted." %
                hdfs_path)
        else:
            # Upload to a temporary path:
            tmp_path = "%s_temp_" % hdfs_path

            # Now upload the file, allowing overwrites as this is a temporary file and
            # simultanous updates should not be possible:
            logger.info("Uploading as %s" % tmp_path)
            with open(local_path, 'rb') as reader, self.client.write(
                    tmp_path, overwrite=True) as writer:
                while True:
                    data = reader.read(10485760)
                    if not data:
                        break
                    writer.write(data)

            # If set, backup-and-replace as needed:
            if backup_and_replace and already_exists:
                date_stamp = datetime.datetime.now().strftime(
                    '%Y-%m-%d_%H-%M-%S')
                backup_path = "%s.bkp_%s" % (hdfs_path, date_stamp)
                logger.warning("Renaming %s to %s..." %
                               (hdfs_path, backup_path))
                self.client.rename(hdfs_path, backup_path)

            # Move the uploaded file into the right place:
            logger.info("Renaming %s to %s..." % (tmp_path, hdfs_path))
            self.client.rename(tmp_path, hdfs_path)

            # Give the namenode a moment to catch-up with itself and then check it's there:
            # FIXME I suspect this is only needed for our ancient HDFS
            time.sleep(2)
            status = self.client.status(hdfs_path)

        logger.info("Calculating hash of HDFS file %s" % hdfs_path)
        hdfs_hash = self.calculate_sha512(hdfs_path)
        logger.info("HDFS %s hash is %s " % (hdfs_path, hdfs_hash))
        if local_hash != hdfs_hash:
            raise Exception("Local & HDFS hashes do not match for %s" %
                            local_path)
        else:
            logger.info("Hashes are equal!")
            success = True

        # Log successful upload:
        logger.warning("Upload completed for %s" % hdfs_path)

        # And return success flag so caller knows it worked:
        return success

    def move(self, local_path, hdfs_path):
        # Perform the PUT first:
        success = self.put(local_path, hdfs_path)
        # And delete the local file if that worked:
        if success:
            os.remove(local_path)

    def calculate_sha512(self, path):
        '''
        Calculate the SHA512 hash of a single file on HDFS
        '''
        with self.client.read(path) as reader:
            file_hash = calculate_reader_hash(reader, path)

        return file_hash

    def _to_info(self, path, status):
        # Add the file path:
        status['file_path'] = path
        # Classify based on HDFS storage conventions:
        item = HdfsPathParser(status).to_dict()
        # Work out the permissions string:
        if status['permission'].isnumeric():
            permissions = permissions_octal_to_string(int(
                status['permission']))
            if status['type'] == 'DIRECTORY':
                permissions = "d" + permissions
            else:
                permissions = "-" + permissions
        else:
            permissions = status['permission']
        # Defined fields based on directory/file status
        if permissions[0] == 'd':
            fs_type = 'directory'
            access_url = '%s/webhdfs/v1%s?op=LISTSTATUS&user.name=%s' % (
                self.webhdfs_url, item['file_path'], self.webhdfs_user)
        else:
            fs_type = 'file'
            access_url = '%s/webhdfs/v1%s?op=OPEN&user.name=%s' % (
                self.webhdfs_url, item['file_path'], self.webhdfs_user)
        # And return as a 'standard' dict:
        return {
            'id': '%s%s' % (self.id_prefix, item['file_path']),
            'refresh_date_dt': self.refresh_date,
            'file_path_s': item['file_path'],
            'file_size_l': item['file_size'],
            'file_ext_s': item['file_ext'],
            'file_name_s': item['file_name'],
            'permissions_s': permissions,
            'hdfs_replicas_i': item['number_of_replicas'],
            'hdfs_user_s': item['user_id'],
            'hdfs_group_s': item['group_id'],
            'modified_at_dt': "%sZ" % item['modified_at'],
            'timestamp_dt': "%sZ" % item['timestamp'],
            'year_i': item['timestamp'][0:4],
            'recognised_b': item['recognised'],
            'kind_s': item['kind'],
            'collection_s': item['collection'],
            'stream_s': item['stream'],
            'job_s': item['job'],
            'layout_s': item['layout'],
            'hdfs_service_id_s': self.service_id,
            'hdfs_type_s': fs_type,
            'access_url_s': access_url
        }

    def list(self, path, recursive=False):
        # Handle non-existant entry, or a file:
        path_status = self.client.status(path, strict=False)
        if path_status is None:
            raise Exception("No such file or directory: %s" % path)
        elif path_status['type'] == 'FILE':
            # Plain old file:
            yield self._to_info(path, path_status)
        else:
            # Handle folders:
            if recursive:
                for dir_info, dirs_info, files_info in self.client.walk(
                        path, status=True):
                    dir_path, dir_status = dir_info
                    for file_name, file_status in files_info:
                        file_path = psp.join(dir_path, file_name)
                        yield self._to_info(file_path, file_status)
            else:
                for file_name, file_status in self.client.list(path,
                                                               status=True):
                    file_path = psp.join(path, file_name)
                    yield self._to_info(file_path, file_status)

    def exists(self, path):
        status = self.client.status(path, strict=False)
        if status:
            return True
        else:
            return False

    def rm(self, path):
        # And delete from HDFS (usually prevented by API proxy)
        # Hard-coded to never act recursively - if you want that, do it manually via the back-end.
        self.client.delete(path, recursive=False)

    def stream(self, path, offset=0, length=None):
        # NOTE our WebHDFS service is very old and uses 'len' not 'length' for controlling the response length:
        # The API proxy we use attempts to remedy this by mapping any 'length' parameter to 'len'.
        return self.client.read(path, offset=offset, length=length)

    def read(self, path, offset=0, length=None):
        with self.stream(path, offset, length) as reader:
            while True:
                data = reader.read(10485760)
                if not data:
                    break
                yield data

    def lsr_to_items(self, reader):
        """
        This task processes a raw list of files generated by the hadoop fs -lsr command.

        As this can be a very large list, it avoids reading it all into memory. It
        parses each line, and yields a suitable stream of parsed objects matching the WebHDFS API.
        """
        for line in reader:
            if "lsr: DEPRECATED: Please use 'ls -R' instead." in line:
                logger.warning(line)
            else:
                permissions, number_of_replicas, userid, groupid, filesize, modification_date, modification_time, filename = line.split(
                    None, 7)
                filename = filename.strip()
                timestamp = datetime.datetime.strptime(
                    '%s %s' % (modification_date, modification_time),
                    '%Y-%m-%d %H:%M')
                info = {
                    'permission': permissions,
                    'replication': number_of_replicas,
                    'owner': userid,
                    'group': groupid,
                    'length': filesize,
                    'modificationTime': timestamp.timestamp() * 1000,
                    'pathSuffix': filename
                }
                info['type'] = 'FILE' if permissions[0] != 'd' else 'DIRECTORY'
                # Skip directories:
                if info['type'] == 'FILE':
                    yield self._to_info(filename, info)
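# A minimal usage sketch for the store above; the service id, the HADOOPS
# configuration it points at and the paths are placeholders.
store = WebHDFSStore('h020')
store.put('/tmp/report.csv', '/user/ingest/report.csv')
for info in store.list('/user/ingest'):
    print(info['file_path_s'], info['file_size_l'])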