Example #1
import os
import shutil
from os import listdir
from os.path import isfile, join

# FileSplit comes from the legacy "filesplit" package (fsplit.filesplit); the
# "settings" module is assumed to provide HDFS_HOST_VALUE and HDFS_USER_VALUE.
from fsplit.filesplit import FileSplit
from hdfs import InsecureClient

import settings


def upload_to_hdfs(input_dir, output_dir, chunk_size):
    # chunk paths produced by FileSplit, collected via the split callback
    # (the legacy filesplit API is assumed to call back with (path, size))
    split_files = []

    def split_callback(path, size):
        split_files.append(path)

    # locate files in the input directory
    files = [
        os.path.abspath("{}/{}".format(input_dir, f))
        for f in listdir(input_dir) if isfile(join(input_dir, f))
    ]
    tmp_dir = "{}/tmp".format(input_dir)

    # set up a clean temp dir
    if os.path.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.mkdir(tmp_dir)

    # split files into chunks of `chunk_size` MB
    for f in files:
        fs = FileSplit(file=f,
                       splitsize=chunk_size * 1e6,
                       output_dir=tmp_dir)
        fs.split(callback=split_callback)

    # connect to HDFS over WebHDFS
    hdfs_client = InsecureClient("http://{}:9870".format(
        settings.HDFS_HOST_VALUE),
                                 user=settings.HDFS_USER_VALUE)

    # delete the existing output dir, if any
    if hdfs_client.content(output_dir, strict=False) is not None:
        hdfs_client.delete(output_dir, recursive=True)

    # upload the chunked files to a temporary HDFS dir
    hdfs_client.upload(hdfs_path="/tmp",
                       local_path=tmp_dir,
                       n_threads=-1,
                       overwrite=True)

    # rename the temporary HDFS dir to output_dir
    hdfs_client.rename("/tmp", output_dir)

    print(
        "{} files uploaded to hdfs host '{}{}'  ({} file chunks total)".format(
            len(files),
            settings.HDFS_HOST_VALUE,
            output_dir,
            len(split_files),
        ))
    # delete local temp files
    shutil.rmtree(tmp_dir)

    # HDFS paths of the uploaded chunks (assumes chunk basenames are kept on upload)
    hdfs_file_paths = [
        "{}/{}".format(output_dir, os.path.basename(p)) for p in split_files
    ]
    return hdfs_file_paths
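# A minimal usage sketch, assuming the settings module above and a local
# ./input directory; the paths and the 128 MB chunk size are placeholders.
chunk_paths = upload_to_hdfs(input_dir="./input",
                             output_dir="/data/output",
                             chunk_size=128)
print(chunk_paths)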
Example #2
from hdfs import InsecureClient


def renameFiles(ip='172.20.10.2',
                port='9870',
                username='******',
                MainName='result-part',
                SubName='.json',
                dirPath='/tmp/Cathay/'):
    client = InsecureClient("http://" + ip + ":" + port, user=username)
    if dirPath[-1] != '/':
        dirPath += '/'
    fns = client.list(dirPath)
    for fn in fns:
        if 'part-' in fn:
            num = str(int(fn.split('part-')[-1]) + 1)
            client.rename(dirPath + fn, dirPath + MainName + num + SubName)
    return str(fns) + "\n     Change to     \n" + str(client.list(dirPath))
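# A minimal usage sketch: with the defaults above this renames e.g.
# /tmp/Cathay/part-00000 to /tmp/Cathay/result-part1.json. The host, port,
# username and directory below are placeholders for the local cluster.
print(renameFiles(ip='172.20.10.2',
                  port='9870',
                  username='hadoop',
                  dirPath='/tmp/Cathay/'))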
Example #3
from hdfs import InsecureClient


class HDFSLibrary:
    """
        Test library for working with HDFS
    """
    WEB_HDFS_URL = ""
    client = ""

    def __init__(self, namenode="localhost", port="50070"):
        self.WEB_HDFS_URL = 'http://' + namenode + ':' + str(port)
        print(namenode, ">>", port, ">>", self.WEB_HDFS_URL)
        self.client = InsecureClient(self.WEB_HDFS_URL)

    def check_hdfs_file_exists(self, file_path, stop=False):
        if self.client.status(file_path, strict=False) is None:
            if stop:
                print("ERROR: File does not exist:", file_path)
                return "ERROR: File does not exist: " + file_path
                # exit(172)
            return False
        return True

    def get_hdfs_file_content(self, file_path):
        self.check_hdfs_file_exists(file_path, stop=True)
        data = ""
        # read as text (utf-8) so lines can be concatenated into a str
        with self.client.read(file_path, encoding='utf-8') as reader:
            for line in reader:
                data += line
        return data

    def search_string_in_hdfs_file(self,
                                   file_path,
                                   text1,
                                   text2="aqwszx",
                                   text3="xzswqa"):
        self.check_hdfs_file_exists(file_path, stop=True)
        found = ""
        # read as text (utf-8) so the substring checks work on str lines
        with self.client.read(file_path, encoding='utf-8') as reader:
            for line in reader:
                if line.find(text1) == -1 and line.find(
                        text2) == -1 and line.find(text3) == -1:
                    continue
                found += line
        return found

    def hdfs_file_should_not_contain(self,
                                     file_path,
                                     text1,
                                     text2="aqwszx",
                                     text3="xzswqa"):
        self.check_hdfs_file_exists(file_path, stop=True)
        with self.client.read(file_path, encoding='utf-8') as reader:
            for line in reader:
                if line.find(text1) != -1 or line.find(
                        text2) != -1 or line.find(text3) != -1:
                    return False
        return True

    ########################
    # # BASIC FUNCTIONS: # #
    ########################
    def get_hdfs_file_folder_content_summary(self, file_path):
        """
        Retrieving a file or folder content summary.
        :return: returns a file or folder content summary.
        """
        self.check_hdfs_file_exists(file_path, stop=True)
        return self.client.content(file_path)

    def get_hdfs_file_folder_status(self, file_path):
        """
        Retrieving a file or folder status.
        :return: returns a file or folder status.
        """
        self.check_hdfs_file_exists(file_path, stop=True)
        return self.client.status(file_path)

    def list_hdfs_directory(self, folder_path):
        """
        Listing all files inside a directory.
        :return: returns a file list.
        """
        self.check_hdfs_file_exists(folder_path, stop=True)
        return self.client.list(folder_path)

    def move_hdfs_file(self, old_path, new_path):
        """
        Renaming ("moving") a file.
        :return: NA
        """
        self.check_hdfs_file_exists(old_path, stop=True)
        self.client.rename(old_path, new_path)

    def delete_hdfs_file(self, file_path):
        """
        Deleting a file or folder recursively.
        :return: returns `True` if the deletion was successful otherwise `False`
        """
        self.check_hdfs_file_exists(file_path)
        return self.client.delete(file_path, recursive=True)

    def copy_to_local_hdfs_file(self, hdfs_path, local_path):
        """
        Copy a file or folder from HDFS to local.
        :return: local_path
        """
        self.check_hdfs_file_exists(hdfs_path)
        return self.client.download(hdfs_path,
                                    local_path,
                                    overwrite=True,
                                    n_threads=4)

    def copy_from_local_hdfs_file(self, local_path, hdfs_path):
        """
        Copy a file or folder from local to HDFS.
        :return: hdfs_path
        """
        return self.client.upload(hdfs_path,
                                  local_path,
                                  overwrite=True,
                                  n_threads=4)

    def get_hdfs_file_checksum(self, file_path):
        """
        Get the checksum value for file
        :return: checksum
        """
        self.check_hdfs_file_exists(file_path, stop=True)
        return self.client.checksum(file_path)

    def create_hdfs_dir(self, dir_path, perm=755):
        """
        Create a directory or recursive dirs on HDFS
        :return: NA
        """
        self.client.makedirs(dir_path, permission=perm)
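# A minimal usage sketch for the library above; the namenode address, port and
# file path are placeholders for the local cluster.
lib = HDFSLibrary(namenode="namenode.example.com", port="9870")
if lib.check_hdfs_file_exists("/data/sample.txt"):
    print(lib.get_hdfs_file_content("/data/sample.txt"))
    print(lib.get_hdfs_file_checksum("/data/sample.txt"))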
Example #4
import os
import tempfile

from flask import current_app
from hdfs import InsecureClient

# DataType is assumed to be a project-level enum providing Folder and File members.


class HadoopFileSystem(object):
    def __init__(self, *opts):
        self.client = InsecureClient(current_app.config['WEBHDFS_ADDR'], user=current_app.config['WEBHDFS_USER'])
         
#     def make_tree(self, datasourceid, client, path):
#         tree = dict(name=(os.path.basename(path), datasourceid + os.path.sep + path), children=[])
#         try: lst = client.list(path, status=True)
#         except:
#             pass #ignore errors
#         else:
#             for fsitem in lst:
#                 fn = os.path.join(path, fsitem[0])
#                 if fsitem[1]['type'] == "DIRECTORY":
#                     tree['children'].append(make_hdfs_tree(datasourceid, client, fn))
#                 else:
#                     tree['children'].append({'name' : (fsitem[0], datasourceid + os.path.sep + fn), 'children' : []})
#         return tree

    def make_json(self, datasourceid, base, relative_path):
        path = os.path.join(base, relative_path)
        data_json = {'datasource': datasourceid, 'path': relative_path, 'name': os.path.basename(relative_path) }
        status = self.client.status(path, False)

        if status is not None:
            if status['type'] == "DIRECTORY":
                data_json['type'] = DataType.Folder
                data_json['children'] = [self.make_json(datasourceid, base, os.path.join(relative_path, fn)) for fn in self.client.list(path)]
            else:
                data_json['type'] = DataType.File
        #print(json.dumps(data_json))
        return data_json
    
    def makedirs(self, path):
        try:
            self.client.makedirs(path)
        except Exception:
            return None
        return path

    def delete(self, path):
        try:
            if self.client.status(path, False) is not None:
                self.client.delete(path, True)
        except Exception as e: print(e)

    def addfolder(self, path):
        # find the first "New Folder (i)" name that does not exist yet
        i = 0
        while self.client.status(os.path.join(path, "New Folder ({0})".format(i)), False) is not None:
            i += 1
        return self.makedirs(os.path.join(path, "New Folder ({0})".format(i)))
    
    def rename(self, oldpath, newpath):
        try:
            self.client.rename(oldpath, newpath)
        except Exception as e: print(e)
    
    def saveUpload(self, file, fullpath):
        localpath = os.path.join(tempfile.gettempdir(), os.path.basename(fullpath))
        if os.path.isfile(localpath):
            os.remove(localpath)
        try:
            file.save(localpath)
            self.client.upload(os.path.dirname(fullpath), localpath, overwrite=True)
        except Exception:
            pass

    def download(self, fullpath):
        status = self.client.status(fullpath, False)
        if status is not None and status['type'] == "FILE":
            localpath = os.path.join(tempfile.gettempdir(), os.path.basename(fullpath))
            return self.client.download(fullpath, localpath, overwrite=True)
        else:
            return None
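# A minimal usage sketch, assuming a Flask app whose config defines the
# WEBHDFS_ADDR and WEBHDFS_USER keys read by the constructor above; the
# address and user shown are placeholders.
from flask import Flask

app = Flask(__name__)
app.config['WEBHDFS_ADDR'] = 'http://namenode:9870'
app.config['WEBHDFS_USER'] = 'hdfs'
with app.app_context():
    fs = HadoopFileSystem()
    fs.makedirs('/tmp/demo')
    print(fs.addfolder('/tmp/demo'))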
Example #5
import os
import tempfile
from os.path import join
from urllib.parse import urljoin, urlsplit, urlunparse

from hdfs import InsecureClient


class HadoopFileSystem():
    def __init__(self, url, user):
        u = urlsplit(url)
        if u.scheme != 'http' and u.scheme != 'https':
            raise ValueError("Invalid name node address")

        self.url = urlunparse((u.scheme, u.netloc, '', '', '', ''))
        self.client = InsecureClient(self.url, user=user)
        self.localdir = u.path
        self.prefix = 'HDFS'

    def normalize_path(self, path):
        path = os.path.normpath(path)
        path = self.strip_prefix(path)
        while path and path[0] == os.sep:
            path = path[1:]
        return os.path.join(self.localdir, path)

    def strip_prefix(self, path):
        return path[len(self.prefix):] if path.startswith(
            self.prefix) else path

    def strip_root(self, path):
        path = self.strip_prefix(path)
        if path.startswith(self.url):
            path = path[len(self.url):]
            if not path.startswith(self.localdir):
                raise ValueError(
                    'Invalid hdfs path. It must start with the root directory')
        return path[len(self.localdir):] if path.startswith(
            self.localdir) else path

    def create_folder(self, path):
        try:
            path = self.normalize_path(path)
            self.client.makedirs(path)
        except Exception:
            return None
        return path

    def remove(self, path):
        try:
            path = self.normalize_path(path)
            if self.client.status(path, False) is not None:
                self.client.delete(path, True)
        except Exception as e:
            print(e)

    def rename(self, oldpath, newpath):
        try:
            oldpath = self.normalize_path(oldpath)
            newpath = self.normalize_path(newpath)
            self.client.rename(oldpath, newpath)
        except Exception as e:
            print(e)

    def get_files(self, path):
        path = self.normalize_path(path)
        files = []
        for f in self.client.list(path):
            status = self.client.status(join(path, f), False)
            if status['type'] != "DIRECTORY":
                files.append(f)
        return files

    def get_folders(self, path):
        path = self.normalize_path(path)
        folders = []
        for f in self.client.list(path):
            status = self.client.status(join(path, f), False)
            if status['type'] == "DIRECTORY":
                folders.append(f)
        return folders

    def exists(self, path):
        path = self.normalize_path(path)
        status = self.client.status(path, False)
        return not (status is None)

    def isdir(self, path):
        path = self.normalize_path(path)
        status = self.client.status(path, False)
        return status is not None and status['type'] == "DIRECTORY"

    def isfile(self, path):
        path = self.normalize_path(path)
        status = self.client.status(path, False)
        return status is not None and status['type'] == "FILE"

    def read(self, path):
        path = self.normalize_path(path)
        with self.client.read(path) as reader:
            return reader.read().decode('utf-8')

    def write(self, path, content):
        path = self.normalize_path(path)
        self.client.write(path, content)

    def make_json(self, path):
        normalized_path = self.normalize_path(path)
        data_json = {
            'path': urljoin(self.url, normalized_path),
            'text': os.path.basename(path)
        }
        status = self.client.status(normalized_path, False)

        if status is not None:
            data_json['folder'] = status['type'] == "DIRECTORY"
            if status['type'] == "DIRECTORY":
                data_json['nodes'] = [
                    self.make_json(os.path.join(path, fn))
                    for fn in self.client.list(normalized_path)
                ]
        #print(json.dumps(data_json))
        return data_json

    def save_upload(self, file, fullpath):
        localpath = os.path.join(tempfile.gettempdir(),
                                 os.path.basename(fullpath))
        if os.path.isfile(localpath):
            os.remove(localpath)
        try:
            file.save(localpath)
            # if the target is an existing HDFS file, upload into its parent dir
            if self.isfile(fullpath):
                fullpath = os.path.dirname(fullpath)
            self.client.upload(self.normalize_path(fullpath), localpath,
                               overwrite=True)
        except Exception:
            pass

    def download(self, path):
        path = self.normalize_path(path)
        status = self.client.status(path, False)
        if status is not None and status['type'] == "FILE":
            localpath = os.path.join(tempfile.gettempdir(),
                                     os.path.basename(path))
            return self.client.download(path, localpath, overwrite=True)
        else:
            return None
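# A minimal usage sketch; the WebHDFS URL (whose path component becomes the
# HDFS root directory) and the user are placeholders for the local cluster.
fs = HadoopFileSystem('http://namenode:9870/user/demo', user='demo')
fs.create_folder('HDFS/reports')
fs.write('HDFS/reports/hello.txt', 'hello from webhdfs')
print(fs.get_files('HDFS/reports'))
print(fs.make_json('HDFS/reports'))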
Example #6

from hdfs import InsecureClient

# Connection details are assumptions; adjust the namenode address and user to
# the local cluster.
client = InsecureClient('http://localhost:9870', user='hadoop')

print('Begin')

# 1. Make a directory named: /activity1/
client.makedirs(hdfs_path='/activity1/', permission=None)
client.makedirs(hdfs_path='/activity1/data/', permission=None)

# 2. Put the file RandomText.txt into HDFS as the path: /activity1/data/RandomText.txt
client.upload(hdfs_path='/activity1/data/', local_path='./RandomText.txt')

# 3. List the contents of the directory /activity1/data/
print(client.list('/activity1/data'))

# 4. Move the HDFS file /activity1/data/RandomText.txt to /activity1/data/NotSoRandomText.txt
client.rename('/activity1/data/RandomText.txt',
              '/activity1/data/NotSoRandomText.txt')

# read the local file so its full contents can be appended below
with open('./RandomText.txt', 'r') as f:
    temp = f.read()

# 5. Append the local file RandomText.txt to the end of the HDFS file: /activity1/data/NotSoRandomText.txt
client.write(hdfs_path='/activity1/data/NotSoRandomText.txt',
             data=temp,
             append=True)

# 6. List the disk space used by the directory /activity1/data/
diskSpaceUsed = client.content('/activity1/data/', strict=True)
print(diskSpaceUsed['spaceConsumed'])

# 7. Put the local file MoreRandomText.txt into HDFS as the path: /activity1/data/MoreRandomText.txt
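# A likely completion for step 7, mirroring the upload call in step 2; the
# local file name is taken from the comment above.
client.upload(hdfs_path='/activity1/data/MoreRandomText.txt',
              local_path='./MoreRandomText.txt')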
Example #7
import time

import numpy as np
import pandas as pd
from cassandra.cluster import Cluster
from hdfs import InsecureClient
from hdfs.util import HdfsError

# create_database() is assumed to be a project-level helper that creates the
# keyspace and column family when needed.


def start_service():
    global isRunning
    global stopRunning
    global currentFileToImport

    stopRunning = False
    isRunning = True

    while not stopRunning:

        # Connect to the Cassandra cluster
        cluster = Cluster(contact_points=['cassandra'], port=9042)
        session = cluster.connect()
        session.default_timeout = 10

        # Create the database if it does not exist
        replicationFactor = 2
        forceReplace = False
        db_name = "pjm"
        columnFamilyName = "estimated_load_hourly"
        create_database(session, db_name, columnFamilyName, replicationFactor,
                        forceReplace)

        try:
            # Download path for the data files
            hdfs_file_path = "/user/root/data/pjm"

            # Connect to the HDFS client
            client = InsecureClient(url='http://namenode:9870', user='******')

            # Create the folder for storing processed files
            client.makedirs(hdfs_file_path + '/imported')

            # Get the list of files to process
            files = client.list(hdfs_file_path, status=True)
            #print(files)
            #print(client.parts(hdfs_file_path))

            # Process the files
            for pjm_file, filestatus in files:
                # Update the current file name for status reporting
                currentFileToImport = pjm_file
                print(pjm_file)

                if filestatus['type'] == 'FILE':
                    # Read the file into memory
                    with client.read(hdfs_file_path + "/" + pjm_file,
                                     encoding='utf-8') as reader:
                        df = pd.read_csv(reader, sep=',', header='infer')

                        # ----- Initial DataFrame transformation -----
                        # Drop unneeded columns
                        df = df.drop(columns=[
                            'datetime_beginning_ept', 'datetime_beginning_utc',
                            'datetime_ending_utc'
                        ])

                        # Drop duplicate observations, keeping the first occurrence
                        df = df.drop_duplicates(
                            subset=['datetime_ending_ept', 'load_area'],
                            keep='first')

                        # Format the observation date
                        df['datetime_measure'] = df.apply(
                            lambda x: pd.to_datetime(
                                x[['datetime_ending_ept']],
                                format='%m/%d/%Y %I:%M:%S %p'),
                            axis=1)
                        df['datetime_measure'] = df['datetime_measure'].astype(
                            'datetime64[ns]')
                        df = df.drop(columns=['datetime_ending_ept'])

                        # Import the raw data into Cassandra (table "estimated_load_hourly")

                        #for i in df.index:
                        #request_insert = "INSERT INTO " + columnFamilyName + " (datetime_ending_ept, load_area, estimated_load) " \
                        #	+ " VALUES ('" + str(df['datetime_measure'][i]) + "','" + df['load_area'][i] + "', " + str(df['estimated_load_hourly'][i]) + ");"
                        #print(request_insert)
                        #session.execute(request_insert)

                        # ----- Data cleaning ---------
                        # Goal: keep the total consumption per hour

                        # Drop the load area column
                        df = df.drop(columns=['load_area'])

                        # Compute the total consumption per hour
                        df = df.groupby(by=['datetime_measure']).sum()

                        # Add extra columns derived from the measurement date
                        df['date'] = df.index.date
                        df['annee'] = df.index.year
                        df['mois'] = df.index.month
                        df['semaine'] = df.index.isocalendar().week
                        df['heure'] = df.index.hour
                        df['jour_annee'] = df.index.dayofyear
                        df['trimestre'] = df.index.quarter
                        df['jour_semaine'] = df.index.dayofweek
                        df['jour_mois'] = df.index.day

                        # Add the very rare missing hours so the observation frequency stays consistent
                        checkmonth = pd.DataFrame(
                            df.groupby(['jour_annee',
                                        'annee'])['annee'].count())
                        checkmonth.rename(columns={'annee': 'nb'},
                                          inplace=True)
                        df_missing = checkmonth[checkmonth.nb != 24]
                        df_missing = df_missing.reset_index()

                        Hour0_24 = pd.DataFrame(np.arange(24))
                        df_to_append = pd.DataFrame()

                        for x, y in zip(df_missing['jour_annee'],
                                        df_missing['annee']):
                            print("Jour avec des heures manquantes :", x, y)

                            df_encours = df[(df.jour_annee == x)
                                            & (df.annee == y)]
                            h_missing = Hour0_24[~Hour0_24[0].isin(df_encours.
                                                                   heure)]

                            h = h_missing.iloc[0].name
                            df_to_append = df_to_append.append(
                                df[(df.jour_annee == x) & (df.annee == y) &
                                   (df.heure == (h + 1))])

                        df_to_append.heure = df_to_append.heure - 1
                        df = df.append(df_to_append)

                        # Insert the cleaned data into Cassandra (table "estimated_load_hourly_summary")
                        for label, row in df.iterrows():
                            print(label)
                            request_insert = "INSERT INTO " + columnFamilyName + "_summary " \
                             + " (datetime_est_load " \
                             + " ,date_est_load " \
                             + " ,annee " \
                             + " ,mois " \
                             + " ,semaine " \
                             + " ,heure " \
                             + " ,trimestre " \
                             + " ,jour_annee " \
                             + " ,jour_semaine " \
                             + " ,jour_mois " \
                             + " ,total_estimated_load " \
                             + " ) " \
                             + " VALUES ('" + str(label) + "' " \
                             + " ,'" + str(row['date']) + "' " \
                             + " ," + str(row['annee']) + " " \
                             + " ," + str(row['mois']) + " " \
                             + " ," + str(row['semaine']) + " " \
                             + " ," + str(row['heure']) + " " \
                             + " ," + str(row['trimestre']) + " " \
                             + " ," + str(row['jour_annee']) + " " \
                             + " ," + str(row['jour_semaine']) + " " \
                             + " ," + str(row['jour_mois']) + " " \
                             + " ," + str(row['estimated_load_hourly']) + " );"

                            print(request_insert)
                            session.execute(request_insert)

                    # Move the processed file into the "imported" folder
                    client.rename(hdfs_file_path + "/" + pjm_file,
                                  hdfs_file_path + '/imported/' + pjm_file)

        except HdfsError as ex:
            # Handle HDFS errors
            print(str(ex))

        # No more files to process
        currentFileToImport = ""

        #closing Cassandra connection
        session.shutdown()

        # Wait before checking for more files
        time.sleep(30)

    return ""
Example #8
import datetime
import logging
import os
import posixpath as psp
import time

from hdfs import InsecureClient

# HADOOPS (per-service configuration), HdfsPathParser, calculate_sha512_local,
# calculate_reader_hash and permissions_octal_to_string are assumed to come
# from the surrounding project.
logger = logging.getLogger(__name__)


class WebHDFSStore():
    '''
    A file store based on the WebHDFS protocol.
    '''
    # Set a refresh-date to indicate when we did this lookup:
    refresh_date = datetime.datetime.utcnow().isoformat(
        timespec='milliseconds') + 'Z'

    def __init__(self, service_id, user_override=None):
        self.service_id = service_id
        self.webhdfs_url = HADOOPS[service_id]['webhdfs_url']
        self.webhdfs_user = HADOOPS[service_id]['webhdfs_user']
        if user_override:
            self.webhdfs_user = user_override
        self.id_prefix = HADOOPS[service_id]['id_prefix']
        self.client = InsecureClient(self.webhdfs_url, self.webhdfs_user)

    def put(self, local_path, hdfs_path, backup_and_replace=False):
        # Get the status of the destination:
        dest_status = self.client.status(hdfs_path, strict=False)

        # Handle files or directories:
        if os.path.isfile(local_path):
            hdfs_path = self._combine_paths(dest_status, local_path, hdfs_path)
            # Return the success flag so move() can act on it:
            return self._upload_file(local_path, hdfs_path, backup_and_replace)
        elif os.path.isdir(local_path):
            # TODO, if it's a directory
            raise Exception(
                "Cannot upload anything other than single files at this time!")
        else:
            raise Exception("Unknown path type! Can't handle %s" % local_path)

    def _combine_paths(self, dest_status, local_path, hdfs_path):
        # If the hdfs_path is a directory, combine the paths:
        if dest_status and dest_status['type'] == 'DIRECTORY':
            combined_path = psp.join(hdfs_path, local_path)
            logger.info("Using combined path: %s" % combined_path)
            return combined_path
        else:
            # Otherwise, just return the path:
            return hdfs_path

    def _upload_file(self, local_path, hdfs_path, backup_and_replace=False):
        """
        Copy up to HDFS, making it suitably atomic by using a temporary filename during upload.

        :return: True if the upload succeeded, False otherwise.
        """

        # Set up flag to record outcome:
        success = False

        # Calculate hash of local file:
        logger.info("Calculating hash of %s" % local_path)
        if not os.path.isfile(local_path):
            raise Exception("Cannot upload %s - individual files only!" %
                            local_path)
        local_hash = calculate_sha512_local(local_path)
        logger.info("Local %s hash is %s " % (local_path, local_hash))

        #
        # TODO Allow upload  to overwrite truncated files?
        #

        # Check if the destination file exists:
        already_exists = self.exists(hdfs_path)
        if already_exists and not backup_and_replace:
            logger.warning(
                "Path %s already exists! No upload will be attempted." %
                hdfs_path)
        else:
            # Upload to a temporary path:
            tmp_path = "%s_temp_" % hdfs_path

            # Now upload the file, allowing overwrites as this is a temporary file and
            # simultanous updates should not be possible:
            logger.info("Uploading as %s" % tmp_path)
            with open(local_path, 'rb') as reader, self.client.write(
                    tmp_path, overwrite=True) as writer:
                while True:
                    data = reader.read(10485760)
                    if not data:
                        break
                    writer.write(data)

            # If set, backup-and-replace as needed:
            if backup_and_replace and already_exists:
                date_stamp = datetime.datetime.now().strftime(
                    '%Y-%m-%d_%H-%M-%S')
                backup_path = "%s.bkp_%s" % (hdfs_path, date_stamp)
                logger.warning("Renaming %s to %s..." %
                               (hdfs_path, backup_path))
                self.client.rename(hdfs_path, backup_path)

            # Move the uploaded file into the right place:
            logger.info("Renaming %s to %s..." % (tmp_path, hdfs_path))
            self.client.rename(tmp_path, hdfs_path)

            # Give the namenode a moment to catch-up with itself and then check it's there:
            # FIXME I suspect this is only needed for our ancient HDFS
            time.sleep(2)
            status = self.client.status(hdfs_path)

        logger.info("Calculating hash of HDFS file %s" % hdfs_path)
        hdfs_hash = self.calculate_sha512(hdfs_path)
        logger.info("HDFS %s hash is %s " % (hdfs_path, hdfs_hash))
        if local_hash != hdfs_hash:
            raise Exception("Local & HDFS hashes do not match for %s" %
                            local_path)
        else:
            logger.info("Hashes are equal!")
            success = True

        # Log successful upload:
        logger.warning("Upload completed for %s" % hdfs_path)

        # And return success flag so caller knows it worked:
        return success

    def move(self, local_path, hdfs_path):
        # Perform the PUT first:
        success = self.put(local_path, hdfs_path)
        # And delete the local file if that worked:
        if success:
            os.remove(local_path)

    def calculate_sha512(self, path):
        '''
        Calculate the SHA512 hash of a single file on HDFS
        '''
        with self.client.read(path) as reader:
            file_hash = calculate_reader_hash(reader, path)

        return file_hash

    def _to_info(self, path, status):
        # Add the file path:
        status['file_path'] = path
        # Classify based on HDFS storage conventions:
        item = HdfsPathParser(status).to_dict()
        # Work out the permissions string:
        if status['permission'].isnumeric():
            permissions = permissions_octal_to_string(int(
                status['permission']))
            if status['type'] == 'DIRECTORY':
                permissions = "d" + permissions
            else:
                permissions = "-" + permissions
        else:
            permissions = status['permission']
        # Defined fields based on directory/file status
        if permissions[0] == 'd':
            fs_type = 'directory'
            access_url = '%s/webhdfs/v1%s?op=LISTSTATUS&user.name=%s' % (
                self.webhdfs_url, item['file_path'], self.webhdfs_user)
        else:
            fs_type = 'file'
            access_url = '%s/webhdfs/v1%s?op=OPEN&user.name=%s' % (
                self.webhdfs_url, item['file_path'], self.webhdfs_user)
        # And return as a 'standard' dict:
        return {
            'id': '%s%s' % (self.id_prefix, item['file_path']),
            'refresh_date_dt': self.refresh_date,
            'file_path_s': item['file_path'],
            'file_size_l': item['file_size'],
            'file_ext_s': item['file_ext'],
            'file_name_s': item['file_name'],
            'permissions_s': permissions,
            'hdfs_replicas_i': item['number_of_replicas'],
            'hdfs_user_s': item['user_id'],
            'hdfs_group_s': item['group_id'],
            'modified_at_dt': "%sZ" % item['modified_at'],
            'timestamp_dt': "%sZ" % item['timestamp'],
            'year_i': item['timestamp'][0:4],
            'recognised_b': item['recognised'],
            'kind_s': item['kind'],
            'collection_s': item['collection'],
            'stream_s': item['stream'],
            'job_s': item['job'],
            'layout_s': item['layout'],
            'hdfs_service_id_s': self.service_id,
            'hdfs_type_s': fs_type,
            'access_url_s': access_url
        }

    def list(self, path, recursive=False):
        # Handle non-existant entry, or a file:
        path_status = self.client.status(path, strict=False)
        if path_status is None:
            raise Exception("No such file or directory: %s" % path)
        elif path_status['type'] == 'FILE':
            # Plain old file:
            yield self._to_info(path, path_status)
        else:
            # Handle folders:
            if recursive:
                for dir_info, dirs_info, files_info in self.client.walk(
                        path, status=True):
                    dir_path, dir_status = dir_info
                    for file_name, file_status in files_info:
                        file_path = psp.join(dir_path, file_name)
                        yield self._to_info(file_path, file_status)
            else:
                for file_name, file_status in self.client.list(path,
                                                               status=True):
                    file_path = psp.join(path, file_name)
                    yield self._to_info(file_path, file_status)

    def exists(self, path):
        status = self.client.status(path, strict=False)
        if status:
            return True
        else:
            return False

    def rm(self, path):
        # And delete from HDFS (usually prevented by API proxy)
        # Hard-coded to never act recursively - if you want that, do it manually via the back-end.
        self.client.delete(path, recursive=False)

    def stream(self, path, offset=0, length=None):
        # NOTE our WebHDFS service is very old and uses 'len' not 'length' for controlling the response length:
        # The API proxy we use attempts to remedy this by mapping any 'length' parameter to 'len'.
        return self.client.read(path, offset=offset, length=length)

    def read(self, path, offset=0, length=None):
        with self.stream(path, offset, length) as reader:
            while True:
                data = reader.read(10485760)
                if not data:
                    break
                yield data

    def lsr_to_items(self, reader):
        """
        This task processes a raw list of files generated by the hadoop fs -lsr command.

        As this can be a very large list, it avoids reading it all into memory. It
        parses each line, and yields a suitable stream of parsed objects matching the WebHDFS API.
        """
        for line in reader:
            if "lsr: DEPRECATED: Please use 'ls -R' instead." in line:
                logger.warning(line)
            else:
                permissions, number_of_replicas, userid, groupid, filesize, modification_date, modification_time, filename = line.split(
                    None, 7)
                filename = filename.strip()
                timestamp = datetime.datetime.strptime(
                    '%s %s' % (modification_date, modification_time),
                    '%Y-%m-%d %H:%M')
                info = {
                    'permission': permissions,
                    'replication': number_of_replicas,
                    'owner': userid,
                    'group': groupid,
                    'length': filesize,
                    'modificationTime': timestamp.timestamp() * 1000,
                    'pathSuffix': filename
                }
                info['type'] = 'FILE' if permissions[0] != 'd' else 'DIRECTORY'
                # Skip directories:
                if info['type'] == 'FILE':
                    yield self._to_info(filename, info)
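# A minimal usage sketch for the store above; the service id, the HADOOPS
# configuration it points at and the paths are placeholders.
store = WebHDFSStore('h020')
store.put('/tmp/report.csv', '/user/ingest/report.csv')
for info in store.list('/user/ingest'):
    print(info['file_path_s'], info['file_size_l'])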