def isfile(hdfs_path, project=None):
    """
    Return True if path refers to a file.

    Args:
        :hdfs_path: You can specify either a full hdfs pathname or a relative one (relative to your Project's path in HDFS).
        :project: If this value is not specified, it will get the path to your project. If you need the path to another project, you can specify the name of the project as a string.

    Returns:
        True if path refers to a file.

    Raises: IOError
    """
    # Identity test: None is a singleton, and `==` could be hijacked by a
    # custom __eq__ on whatever object the caller passes in.
    if project is None:
        project = project_name()
    hdfs_path = _expand_path(hdfs_path, project)
    return path.isfile(hdfs_path)
def walk(self, parent_path, file_list, hdfs_parent_path, hdfs_file_list):
    """
    Recursively check that a local directory listing matches an HDFS one.

    Both listings are ordered the same way (directories first, then by base
    name) and compared position by position: paired entries must share a
    base name, local directories must be HDFS directories and are compared
    recursively, and local regular files must have the same size in HDFS.

    Args:
        :parent_path: local directory whose entries are being compared
        :file_list: entry names found in ``parent_path``
        :hdfs_parent_path: HDFS directory whose entries are being compared
        :hdfs_file_list: entries found in ``hdfs_parent_path``

    Returns:
        True if every entry matches, False on the first mismatch.
    """
    print("%s %s" % (parent_path, hdfs_parent_path))

    if not file_list and not hdfs_file_list:
        # Two empty directories match only if they are named the same.
        return os.path.basename(parent_path) == path.basename(hdfs_parent_path)

    if len(file_list) != len(hdfs_file_list):
        print("No match: number of files in dirs")
        return False

    # Sort copies (the caller's lists are left untouched) by (is-file, name)
    # so that the pairing below does not depend on the arbitrary order in
    # which the two file systems happen to list their entries.
    local_entries = sorted(
        file_list,
        key=lambda f: (os.path.isfile(os.path.join(parent_path, f)),
                       os.path.basename(f)))
    hdfs_entries = sorted(
        hdfs_file_list,
        key=lambda f: (path.isfile(path.join(hdfs_parent_path, f)),
                       path.basename(f)))

    for sub_path, hdfs_sub_path in zip(local_entries, hdfs_entries):
        full_path = os.path.join(parent_path, sub_path)
        hdfs_full_path = path.join(hdfs_parent_path, hdfs_sub_path)

        if os.path.basename(sub_path) != path.basename(hdfs_sub_path):
            print("No match: %s and %s" % (sub_path, hdfs_sub_path))
            return False

        if os.path.isdir(full_path):
            if not path.isdir(hdfs_full_path):
                print("No match on directory: %s and %s" %
                      (full_path, hdfs_full_path))
                return False
            # Only bail out on a mismatch; on success keep going so that the
            # remaining siblings of this directory are compared as well.
            if not self.walk(full_path, os.listdir(full_path),
                             hdfs_full_path, hdfs.ls(hdfs_full_path)):
                return False
        elif os.path.isfile(full_path):
            if path.getsize(hdfs_full_path) != os.path.getsize(full_path):
                return False

    return True
def add_module(hdfs_path, project=None):
    """
    Add a .py or .ipynb file from HDFS to sys.path

    For example, if you execute:

    >>> add_module("Resources/my_module.py")
    >>> add_module("Resources/my_notebook.ipynb")

    You can import it simply as:

    >>> import my_module
    >>> import my_notebook

    Args:
        :hdfs_path: You can specify either a full hdfs pathname or a relative one (relative to your Project's path in HDFS) to a .py or .ipynb file
        :project: Name of the project used to expand ``hdfs_path``. If not specified, the name returned by ``project_name()`` is used.

    Returns:
        Return full local path to localized python file or converted python file in case of .ipynb file

    Raises:
        Exception: if the path is not a .py or .ipynb file, if the jupyter
            binary cannot be found, or if the notebook conversion fails.
        KeyError: if a notebook is given and PYSPARK_PYTHON is not set.
    """
    # All localized modules live in one package-like directory under the
    # current working directory; that directory is what makes them importable.
    localized_deps = os.getcwd() + "/localized_deps"
    if not os.path.exists(localized_deps):
        os.mkdir(localized_deps)
        with open(localized_deps + '/__init__.py', mode='w'):
            pass  # just create an empty __init__.py

    if localized_deps not in sys.path:
        sys.path.append(localized_deps)

    if project is None:
        project = project_name()
    hdfs_path = _expand_path(hdfs_path, project)

    # Evaluate the file check once instead of once per branch.
    is_file = path.isfile(hdfs_path)

    if is_file and hdfs_path.endswith('.py'):
        py_path = copy_to_local(hdfs_path, localized_deps, overwrite=True)
        # NOTE(review): this appends the file itself to sys.path; imports are
        # presumably resolved through the localized_deps directory added
        # above. Kept for backward compatibility.
        if py_path not in sys.path:
            sys.path.append(py_path)
        return py_path

    if is_file and hdfs_path.endswith('.ipynb'):
        ipynb_path = copy_to_local(hdfs_path, localized_deps, overwrite=True)

        # The jupyter binary is looked up next to the interpreter named by
        # PYSPARK_PYTHON.
        python_path = os.environ['PYSPARK_PYTHON']
        jupyter_binary = os.path.dirname(python_path) + '/jupyter'
        if not os.path.exists(jupyter_binary):
            raise Exception('Could not find jupyter binary on path {}'.format(jupyter_binary))

        # Remove any stale conversion so the existence check after nbconvert
        # proves that a fresh file was produced.
        converted_py_path = os.path.splitext(ipynb_path)[0] + '.py'
        if os.path.exists(converted_py_path):
            os.remove(converted_py_path)

        conversion = subprocess.Popen(
            [jupyter_binary, 'nbconvert', '--to', 'python', ipynb_path],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        out, err = conversion.communicate()
        if conversion.returncode != 0:
            raise Exception("Notebook conversion to .py failed: stdout: {} \n stderr: {}".format(out, err))

        if not os.path.exists(converted_py_path):
            raise Exception('Could not find converted .py file on path {}'.format(converted_py_path))
        if converted_py_path not in sys.path:
            sys.path.append(converted_py_path)
        return converted_py_path

    raise Exception("Given path " + hdfs_path + " does not point to a .py or .ipynb file")