def getFileSystem(fs="dfs"):
    """
    Returns a Hadoop FileSystem object, either "dfs" (default) or "local".
    """
    if fs == "dfs":
        return FileSystem.get(happy.getJobConf())
    elif fs == "local":
        return FileSystem.getLocal(happy.getJobConf())
    else:
        raise Exception("Unknown filesystem " + fs)
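# Illustrative usage sketch (not part of the library): fetching a handle to the
# distributed filesystem versus the node-local filesystem. Assumes a Happy job
# configuration is available via happy.getJobConf().
#
#   dfs_handle = getFileSystem()            # HDFS
#   local_handle = getFileSystem("local")   # local filesystem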
def merge(path, dst):
    """
    Merges the files in a specified directory into a specified destination file.
    """
    input = DatasetPath(happy.getJobConf(), path)
    output = DatasetPath(happy.getJobConf(), dst)
    input.copyTo(output)
def read(path):
    """
    Returns a Python file-like object for a specified DFS file or directory.
    Merges files in a specified directory.
    """
    # this is a hack because PyFile doesn't support Readers:
    return ReaderFile(DatasetPath(happy.getJobConf(), path).getReader())
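# Illustrative usage sketch: reading merged output from a DFS directory. The
# returned object is file-like, so standard line iteration is assumed to apply;
# the path and the process() call are hypothetical.
#
#   f = read("output/wordcounts")
#   for line in f:
#       process(line)
#   f.close()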
def createCollector(path, fs="dfs", type="text", compressiontype="lzo", sequencetype="BLOCK"):
    """
    Creates a "text" (default), "sequence", or "bjson" file collector at the specified path.
    Collectors are automatically closed at the end of the job.
    """
    filesystem = getFileSystem(fs)
    datasetPath = DatasetPath(filesystem, path)
    datasetPath.deletePath()
    if type == "sequence":
        collector = TextSequenceFileCollector(filesystem, happy.getJobConf(), Path(path),
                                              _getSequenceFileType(sequencetype),
                                              _getCodecInstance(compressiontype))
    elif type == "text":
        collector = TextFileCollector(filesystem, happy.getJobConf(), Path(path))
    elif type == "bjson":
        collector = BJSONCollector(filesystem, happy.getJobConf(), Path(path),
                                   _getSequenceFileType(sequencetype),
                                   _getCodecInstance(compressiontype))
    else:
        raise Exception("Unknown collector type " + type)
    # add as a closeable so that it is closed correctly at the end of the job:
    if happy.job is not None:
        happy.job.addCloseable(collector)
    return collector
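# Illustrative usage sketch: writing records through a block-compressed sequence
# file collector. The collect(key, value) call is an assumption based on the
# Hadoop OutputCollector convention, and the output path is hypothetical.
#
#   collector = createCollector("tmp/counts", type="sequence",
#                               compressiontype="lzo", sequencetype="BLOCK")
#   collector.collect("apple", "3")
#   # no explicit close needed: the collector is registered via happy.job.addCloseable()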
def mktemp(name=None):
    """
    Generates a directory path that is safe to use for temporary data.
    An optional name will be used to prefix the path for easier debugging.
    The path will be generated within the current hadoop.tmp.dir and will sort chronologically.
    """
    path = happy.getJobConf().get("hadoop.tmp.dir") + "/"
    if name:
        path += str(name) + "-"
    path += "%.0f%i" % (time.time(), random.randint(0, 1E5))
    return path
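# Illustrative usage sketch: the generated path lives under hadoop.tmp.dir and
# ends with the current timestamp immediately followed by a random integer, so
# repeated calls sort chronologically. The exact value depends on the cluster
# configuration and the current time; "myjob" is a hypothetical prefix.
#
#   scratch = mktemp("myjob")
#   collector = createCollector(scratch)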
def rename(src, dst):
    """
    Renames a DFS path.
    """
    DatasetPath(happy.getJobConf(), src).rename(dst)
def copyFromLocal(localpath, path):
    """
    Copies a local path to a DFS path.
    """
    DatasetPath(happy.getJobConf(), path).copyFromLocal(localpath)
def copyToLocal(path, localpath):
    """
    Copies a DFS path to a local file.
    Merges files in a specified directory.
    """
    DatasetPath(happy.getJobConf(), path).copyToLocal(localpath)
def delete(path):
    """
    Deletes a specified DFS path.
    """
    DatasetPath(happy.getJobConf(), path).deletePath()
def grep(path, regex):
    """
    Returns an iterator over the lines in a path that contain a match for a given
    regular expression. Uses Java regex syntax.
    """
    return StringIterator.getIterator(DatasetPath(happy.getJobConf(), path).grepLines(regex))
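# Illustrative usage sketch: scanning a DFS path for matching lines. The pattern
# uses java.util.regex syntax; the log path is hypothetical.
#
#   for line in grep("logs/2009-01-01", "ERROR\\s+\\d+"):
#       print line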
def write(path, compressiontype=None):
    """
    Returns a writable Python file-like object for a specified DFS file,
    using the specified compression codec.
    """
    return WriterFile(DatasetPath(happy.getJobConf(), path).getWriter(_getCodec(compressiontype)))
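# Illustrative usage sketch: writing a DFS file through the file-like wrapper and
# reading it back with read(). Assumes the returned objects support the standard
# write()/read()/close() file methods; the path is hypothetical.
#
#   out = write("tmp/greeting.txt")
#   out.write("hello world\n")
#   out.close()
#   print read("tmp/greeting.txt").read()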
def openMapDir(path):
    """
    Opens a MapDir map over a directory of MapFiles.
    """
    return PyMapDir.openMapDir(getFileSystem(), path, happy.getJobConf())
def exists(path):
    """
    Returns True if this path exists.
    """
    return DatasetPath(happy.getJobConf(), path).exists()
def fileStatus(path):
    """
    Returns the org.apache.hadoop.fs.FileStatus object for this path.
    """
    return DatasetPath(happy.getJobConf(), path).getFileStatus()