Beispiel #1
0
def isfile(hdfs_path, project=None):
    """
    Return True if path refers to a file.

    Args:
        :hdfs_path: You can specify either a full hdfs pathname or a relative one (relative to your Project's path in HDFS).
        :project: Name of the project used to expand a relative path. If this value is not specified, the name returned by ``project_name()`` is used. If you need a path in another project, you can specify the name of that project as a string.

    Returns:
        True if path refers to a file.

    Raises: IOError (as stated by the original documentation; presumably raised by the underlying HDFS call - confirm against the HDFS client in use)
    """
    # None is a singleton: compare by identity so a custom __eq__ on the
    # argument can never make a real value look like "not specified".
    if project is None:
        project = project_name()
    hdfs_path = _expand_path(hdfs_path, project)
    return path.isfile(hdfs_path)
Beispiel #2
0
    def walk(self, parent_path, file_list, hdfs_parent_path, hdfs_file_list):
        """
        Recursively compare a local directory tree with an HDFS directory tree.

        Two trees match when, at every level, both sides hold the same number
        of entries, the entries have the same base names, directories pair up
        with directories, and paired files have the same size in bytes.
        Progress and the reason for most mismatches are printed to stdout.

        Args:
            :parent_path: Local directory whose entries are listed in file_list.
            :file_list: Entry names inside parent_path (the recursion passes os.listdir(parent_path)).
            :hdfs_parent_path: HDFS directory whose entries are listed in hdfs_file_list.
            :hdfs_file_list: Entries inside hdfs_parent_path (the recursion passes hdfs.ls(hdfs_parent_path)).

        Returns:
            True if the two trees match, False otherwise.
        """
        print("%s %s" % (parent_path, hdfs_parent_path))
        if not file_list and not hdfs_file_list:
            # Two empty directories match only when they carry the same name.
            return os.path.basename(parent_path) == path.basename(
                hdfs_parent_path)
        if len(file_list) != len(hdfs_file_list):
            print("No match: number of files in dirs")
            return False

        # Order both listings identically: directories first, then files, each
        # group by base name. Sorting on the file/dir flag alone would leave
        # the pairing dependent on the (arbitrary) order of the two listings.
        # Sorted copies are used so the caller's lists are left untouched.
        local_entries = sorted(
            file_list,
            key=lambda f: (os.path.isfile(os.path.join(parent_path, f)),
                           os.path.basename(f)))
        hdfs_entries = sorted(
            hdfs_file_list,
            key=lambda f: (path.isfile(path.join(hdfs_parent_path, f)),
                           path.basename(f)))

        for sub_path, hdfs_sub_path in zip(local_entries, hdfs_entries):
            full_path = os.path.join(parent_path, sub_path)
            hdfs_full_path = path.join(hdfs_parent_path, hdfs_sub_path)

            if (os.path.basename(sub_path) !=
                    path.basename(hdfs_sub_path)):
                print("No match: %s and %s" % (sub_path, hdfs_sub_path))
                return False

            if os.path.isdir(full_path):
                if not path.isdir(hdfs_full_path):
                    print("No match on directory: %s and %s" %
                          (full_path, hdfs_full_path))
                    return False
                # Only a mismatch ends the comparison early; a matching
                # sub-tree must not stop the remaining siblings from being
                # checked.
                if not self.walk(full_path, os.listdir(full_path),
                                 hdfs_full_path, hdfs.ls(hdfs_full_path)):
                    return False
            elif os.path.isfile(full_path):
                if path.getsize(hdfs_full_path) != os.path.getsize(full_path):
                    return False

        return True
Beispiel #3
0
def add_module(hdfs_path, project=None):
    """
    Add a .py or .ipynb file from HDFS to sys.path

    The file is copied into a ``localized_deps`` directory under the current
    working directory. That directory is created on first use, given an empty
    ``__init__.py`` and appended to sys.path. A notebook is additionally
    converted to a .py file with ``jupyter nbconvert``.

    For example, if you execute:

    >>> add_module("Resources/my_module.py")
    >>> add_module("Resources/my_notebook.ipynb")

    You can import it simply as:

    >>> import my_module
    >>> import my_notebook

    Args:
        :hdfs_path: You can specify either a full hdfs pathname or a relative one (relative to your Project's path in HDFS) to a .py or .ipynb file
        :project: Name of the project used to expand a relative path. If this value is not specified, the name returned by ``project_name()`` is used.

    Returns:
        Return full local path to localized python file or converted python file in case of .ipynb file

    Raises:
        KeyError: for a notebook, if the PYSPARK_PYTHON environment variable is not set.
        Exception: if the path is not an HDFS file ending in .py or .ipynb, if no
            ``jupyter`` binary sits next to the PYSPARK_PYTHON interpreter, if the
            notebook conversion exits with a non-zero status, or if the converted
            file cannot be found afterwards.
    """

    localized_deps = os.path.join(os.getcwd(), "localized_deps")
    if not os.path.exists(localized_deps):
        os.mkdir(localized_deps)
        open(os.path.join(localized_deps, '__init__.py'), mode='w').close()

    if localized_deps not in sys.path:
        sys.path.append(localized_deps)

    if project is None:
        project = project_name()
    hdfs_path = _expand_path(hdfs_path, project)

    # Ask HDFS once; the answer is needed by both file-type branches.
    is_file = path.isfile(hdfs_path)

    if is_file and hdfs_path.endswith('.py'):
        py_path = copy_to_local(hdfs_path, localized_deps, overwrite=True)
        # NOTE(review): the localized_deps directory entry added above is
        # presumably what makes the module importable; the file path itself is
        # still appended to keep the existing sys.path side effect unchanged.
        if py_path not in sys.path:
            sys.path.append(py_path)
        return py_path
    elif is_file and hdfs_path.endswith('.ipynb'):
        ipynb_path = copy_to_local(hdfs_path, localized_deps, overwrite=True)
        # The jupyter executable is looked up next to the PySpark interpreter.
        python_path = os.environ['PYSPARK_PYTHON']
        jupyter_binary = os.path.join(os.path.dirname(python_path), 'jupyter')
        if not os.path.exists(jupyter_binary):
            raise Exception('Could not find jupyter binary on path {}'.format(jupyter_binary))

        # Drop any earlier conversion so the existence check after nbconvert
        # cannot be satisfied by a stale file.
        converted_py_path = os.path.splitext(ipynb_path)[0] + '.py'
        if os.path.exists(converted_py_path):
            os.remove(converted_py_path)

        conversion = subprocess.Popen([jupyter_binary, 'nbconvert', '--to', 'python', ipynb_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = conversion.communicate()
        if conversion.returncode != 0:
            raise Exception("Notebook conversion to .py failed: stdout: {} \n stderr: {}".format(out, err))

        if not os.path.exists(converted_py_path):
            raise Exception('Could not find converted .py file on path {}'.format(converted_py_path))
        if converted_py_path not in sys.path:
            sys.path.append(converted_py_path)
        return converted_py_path
    else:
        raise Exception("Given path " + hdfs_path + " does not point to a .py or .ipynb file")