Ejemplo n.º 1
0
    def walk(self, parent_path, file_list, hdfs_parent_path, hdfs_file_list):
        """
        Recursively checks that a local directory listing mirrors a HDFS directory listing.

        Both listings are sorted in place (directories first, then by base name) and
        compared pairwise: paired entries must have the same base name, a local
        directory must be a directory in HDFS too and match recursively, and a local
        regular file must have the same size in bytes as its HDFS counterpart.

        Args:
            :parent_path: path of the local directory being compared
            :file_list: entries of parent_path (this list is sorted in place)
            :hdfs_parent_path: path of the HDFS directory being compared
            :hdfs_file_list: entries of hdfs_parent_path (this list is sorted in place)

        Returns:
            True if every entry matches, False otherwise
        """
        print("%s %s" % (parent_path, hdfs_parent_path))
        if len(file_list) == 0 and len(hdfs_file_list) == 0:
            # Two empty directories only match if they carry the same name.
            return os.path.basename(parent_path) == path.basename(
                hdfs_parent_path)
        if len(file_list) != len(hdfs_file_list):
            print("No match: number of files in dirs")
            return False

        # Listing order is not guaranteed on either side, so order by name as
        # well as by type; otherwise identical directories could be reported
        # as different just because the listings came back in another order.
        file_list.sort(
            key=lambda f: (os.path.isfile(os.path.join(parent_path, f)),
                           os.path.basename(f)))
        hdfs_file_list.sort(
            key=lambda f: (path.isfile(path.join(hdfs_parent_path, f)),
                           path.basename(f)))

        for sub_path, hdfs_sub_path in zip(file_list, hdfs_file_list):
            full_path = os.path.join(parent_path, sub_path)
            hdfs_full_path = path.join(hdfs_parent_path, hdfs_sub_path)

            if (os.path.basename(sub_path) !=
                    path.basename(hdfs_sub_path)):
                print("No match: %s and %s" % (sub_path, hdfs_sub_path))
                return False

            if os.path.isdir(full_path):
                if not path.isdir(hdfs_full_path):
                    print("No match on directory: %s and %s" %
                          (full_path, hdfs_full_path))
                    return False
                # Only stop on a mismatch: returning the recursive result
                # unconditionally would skip every remaining sibling entry.
                if not self.walk(full_path, os.listdir(full_path),
                                 hdfs_full_path, hdfs.ls(hdfs_full_path)):
                    return False
            elif os.path.isfile(full_path):
                if path.getsize(hdfs_full_path) != os.path.getsize(full_path):
                    return False

        return True
Ejemplo n.º 2
0
def _is_same_directory(local_path, hdfs_path):
    """
    Checks that the local directory and the hdfs directory hold the same entry names.

    Only base names are compared: every file and directory name found while walking
    local_path is collected, the base names of the entries returned by
    glob(hdfs_path + '/*', recursive=True) are collected, and the two sorted name
    lists must be equal. File sizes and contents are not inspected.

    Args:
        :local_path: path of the local directory
        :hdfs_path: path of the hdfs directory

    Returns:
        True if both sorted lists of names are equal, False otherwise
    """
    local_names = []
    for _root, subdirs, files in os.walk(local_path):
        # The '*' pattern matches every name, so nothing is filtered out here.
        local_names.extend(fnmatch.filter(files, '*'))
        local_names.extend(fnmatch.filter(subdirs, '*'))

    hdfs_names = [
        path.basename(str(entry))
        for entry in glob(hdfs_path + '/*', recursive=True)
    ]

    return sorted(local_names) == sorted(hdfs_names)
Ejemplo n.º 3
0
def copy_to_local(hdfs_path, local_path="", overwrite=False, project=None):
    """
    Copies a directory or file from a HDFS project to a local directory.

    If the local file exists, overwrite is False and the hdfs file and the local file are the same size in bytes,
    the local pathname is returned immediately without downloading.
    If the local directory exists, overwrite is False and _is_same_directory reports that it matches the hdfs directory,
    the local pathname is returned immediately without downloading.
    A local copy that exists but does not match is deleted and downloaded again.

    For example, if you execute:

    >>> copy_to_local("Resources/my_data")

    This will copy the directory my_data from the Resources dataset in your project to the current working directory on the path ./my_data

    Raises:
      IOError if local_path is not an existing directory, if no file/directory name can be derived from hdfs_path,
      or if there is not enough free space to localize the file/directory

    Args:
        :hdfs_path: You can specify either a full hdfs pathname or a relative one (relative to your Project's path in HDFS).
        :local_path: the relative or full path to a directory on the local filesystem to copy to (relative paths are resolved against $CWD), defaults to $CWD
        :overwrite: a boolean flag whether to overwrite if the path already exists in the local directory.
        :project: name of the project, defaults to the current HDFS user's project

    Returns:
        the full local pathname of the file/dir
    """

    if project is None:
        project = project_name()

    # os.path.join keeps an absolute local_path untouched and resolves a
    # relative one against the current working directory.
    local_dir = os.path.join(os.getcwd(), local_path)

    if not os.path.isdir(local_dir):
        raise IOError("You need to supply the path to a local directory. This is not a local dir: %s" % local_dir)

    # A trailing '/' would yield an empty name, which makes full_local point at
    # local_dir itself and lets the clean-up below delete the whole target directory.
    filename = path.basename(hdfs_path.rstrip('/'))
    if not filename:
        raise IOError("Could not determine the name of the file/directory to copy from hdfs path: %s" % hdfs_path)
    full_local = os.path.join(local_dir, filename)

    project_hdfs_path = _expand_path(hdfs_path, project=project)

    # Get the amount of free space on the local drive
    stat = os.statvfs(local_dir)
    free_space_bytes = stat.f_bsize * stat.f_bavail

    hdfs_size = path.getsize(project_hdfs_path)

    if os.path.isfile(full_local) and not overwrite:
        if hdfs_size == os.path.getsize(full_local):
            print("File " + project_hdfs_path + " is already localized, skipping download...")
            return full_local
        # Stale local copy: remove it so it can be downloaded again.
        os.remove(full_local)

    if os.path.isdir(full_local) and not overwrite:
        # Only the comparison is best-effort; a failure to compare simply
        # means the directory is downloaded again.
        try:
            localized = _is_same_directory(full_local, project_hdfs_path)
        except Exception as e:
            print("Failed while checking directory structure to avoid re-downloading dataset, falling back to downloading")
            print(e)
            localized = False
        if localized:
            print("Full directory subtree already on local disk and unchanged. Set overwrite=True to force download")
            return full_local
        shutil.rmtree(full_local)

    if hdfs_size > free_space_bytes:
        raise IOError("Not enough local free space available on scratch directory: %s" % local_path)

    if overwrite:
        if os.path.isdir(full_local):
            shutil.rmtree(full_local)
        elif os.path.isfile(full_local):
            os.remove(full_local)

    print("Started copying " + project_hdfs_path + " to local disk on path " + local_dir + "\n")

    hdfs.get(project_hdfs_path, local_dir)

    print("Finished copying\n")

    return full_local
Ejemplo n.º 4
0
def copy_to_local(hdfs_path, local_path, overwrite=False, project=None):
    """
    Copies a directory or file from a HDFS project to a local private scratch directory.

    If the local file exists, overwrite is False and the hdfs file and the local file are the same size in bytes,
    the local pathname is returned immediately without downloading.
    If the local directory exists, overwrite is False and FsTree().check(...) returns True for the local and hdfs
    directories, the local pathname is returned immediately without downloading.

    Raises:
      IOError if local_path is not an existing directory, if no file/directory name can be derived from hdfs_path,
      or if there is not enough free space to localize the file/directory

    Args:
        :hdfs_path: You can specify either a full hdfs pathname or a relative one (relative to your Project's path in HDFS).
        :local_path: the relative or full path to a directory on the local filesystem to copy to (relative paths are resolved against $PDIR if it is set, otherwise against $CWD)
        :overwrite: a boolean flag whether to overwrite if the path already exists in the local scratch directory.
        :project: name of the project, defaults to the current HDFS user's project

    Returns:
        the full local pathname of the file/dir
    """

    if project is None:
        project = project_name()

    if "PDIR" in os.environ:
        base_dir = os.environ['PDIR']
    else:
        base_dir = os.getcwd()
    # os.path.join keeps an absolute local_path untouched and resolves a
    # relative one against the scratch directory.
    local_dir = os.path.join(base_dir, local_path)

    if not os.path.isdir(local_dir):
        raise IOError(
            "You need to supply the path to a local directory. This is not a local dir: %s"
            % local_dir)

    # A trailing '/' would yield an empty name, which makes full_local point at
    # local_dir itself and lets overwrite=True delete the whole target directory.
    filename = path.basename(hdfs_path.rstrip('/'))
    if not filename:
        raise IOError(
            "Could not determine the name of the file/directory to copy from hdfs path: %s"
            % hdfs_path)
    full_local = os.path.join(local_dir, filename)

    project_hdfs_path = _expand_path(hdfs_path, project=project)

    # Get the amount of free space on the local drive
    stat = os.statvfs(local_dir)
    free_space_bytes = stat.f_bsize * stat.f_bavail

    hdfs_size = path.getsize(project_hdfs_path)

    if os.path.isfile(full_local) and not overwrite:
        if hdfs_size == os.path.getsize(full_local):
            return full_local

    if os.path.isdir(full_local) and not overwrite:
        if FsTree().check(full_local, project_hdfs_path) == True:
            print(
                "Full directory subtree already on local disk and unchanged.")
            return full_local

    if hdfs_size > free_space_bytes:
        # Report the directory whose free space was measured (the original
        # formatted the `path` module object into the message).
        raise IOError(
            "Not enough local free space available on scratch directory: %s" %
            local_dir)

    if overwrite:
        if os.path.isdir(full_local):
            shutil.rmtree(full_local)
        elif os.path.isfile(full_local):
            os.remove(full_local)

    hdfs.get(project_hdfs_path, local_dir)

    return full_local