def parse(self, line):
  d = {}
  try:
    so = ScanfParser.parse(self, ' '.join(line.split()), True)
    for attr, value in zip(self._attrs, so.ungrouped()):
      d[attr] = (self._handlers[attr](attr, value)
                 if attr in self._handlers else value)
  except ScanfParser.ParseError:
    return {}
  return d
def parse(self, line):
  d = {}
  try:
    so = ScanfParser.parse(self, ' '.join(line.split()), True)
    for attr, value in zip(self._attrs, so.ungrouped()):
      d[attr] = (self._handlers[attr](attr, value)
                 if attr in self._handlers else value)
  except ScanfParser.ParseError as e:
    if log:
      log.error('ProcessHandleParser failed: %s' % e)
    return {}
  return d
def __init__(self, attrs, type_map, handlers=None):
  # Use None instead of a mutable default argument for handlers.
  self._attrs = attrs
  self._handlers = handlers or {}
  # Build the scanf format string from the per-attribute conversion map.
  attr_list = map(type_map.get, attrs)
  ScanfParser.__init__(self, ' '.join(attr_list))
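# Example (sketch): wiring the methods above up for a whitespace-delimited
# process line. This assumes they live on a ProcessHandleParser(ScanfParser)
# subclass; the attribute names, type map, handler, and sample line below
# are hypothetical.
PS_TYPE_MAP = {'pid': '%(pid)d', 'user': '%(user)s', 'comm': '%(comm)s'}

ps_parser = ProcessHandleParser(
    ['pid', 'user', 'comm'],
    PS_TYPE_MAP,
    handlers={'comm': lambda attr, value: os.path.basename(value)})

# {'pid': 1234, 'user': 'root', 'comm': 'python'} on success, or {} if the
# line does not match the format.
print(ps_parser.parse('1234  root  /usr/bin/python'))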
class HDFSHelper(object):
  """Provides a set of functions for HDFS operations.

  NOTE: This class assumes a local hdfs or hadoop client on the PATH.
  """

  class InternalError(Exception): pass

  PARSER = ScanfParser(
      '%(mode)s %(dirents)s %(user)s %(group)s %(filesize)d '
      '%(year)d-%(month)d-%(day)d %(hour)d:%(minute)d')

  def __init__(self, config, command_class=CommandUtil, heap_limit=Amount(256, Data.MB),
               use_hadoop_v1=False):
    """heap_limit is the maximum heap that should be allocated to the command
    process, defined using twitter.common.quantity.Data.
    use_hadoop_v1 sets the command to hadoop instead of hdfs.
    """
    if not os.path.isdir(config):
      raise ValueError('Command requires root of a config tree')
    self._config = config
    self._cmd_class = command_class
    if heap_limit is None:
      raise ValueError('The hdfs heap_limit must not be specified as "None".')
    self._heap_limit = heap_limit
    self.cli_command = 'hadoop' if use_hadoop_v1 else 'hdfs'
    if self._cmd_class.execute_suppress_stdout_stderr(self.cli_command) != 0:
      raise OSError(
          'The "{0}" utility is not available on the system PATH'.format(self.cli_command))

  @property
  def config(self):
    return self._config

  def _call(self, cmd, *args, **kwargs):
    """Runs an fs command with the given command and args.

    Checks the result of the call by default, but this can be disabled
    with check=False.
    """
    cmd = [self.cli_command, '--config', self._config, 'dfs', cmd] + list(args)
    heapsize = str(int(self._heap_limit.as_(Data.MB)))
    with environment_as(HADOOP_HEAPSIZE=heapsize):
      if kwargs.get('check'):
        return self._cmd_class.check_call(cmd)
      elif kwargs.get('return_output'):
        return self._cmd_class.execute_and_get_output(cmd)
      elif kwargs.get('suppress_output'):
        return self._cmd_class.execute_suppress_stdout(cmd)
      else:
        return self._cmd_class.execute(cmd)

  def get(self, src, dst):
    """Copies file(s) in HDFS to a local path (via proxy if necessary).

    NOTE: If src matches multiple files, make sure dst is a directory!
    """
    hdfs_src = ' '.join(src) if isinstance(src, list) else src
    return self._call('-get', hdfs_src, dst)

  def put(self, src, dst):
    """Copies the local file src to the HDFS path dst."""
    abs_src = os.path.expanduser(src)
    assert os.path.exists(abs_src), 'File does not exist, cannot copy: %s' % abs_src
    return self._do_put(abs_src, dst)

  def _do_put(self, source, dst):
    """Puts the local file into HDFS, replacing any existing file."""
    hdfs_dst = ' '.join(dst) if isinstance(dst, list) else dst
    # An exit code of 0 from -test -e means the destination already exists.
    if not self._call('-test', '-e', hdfs_dst, check=False):
      self._call('-rm', '-skipTrash', hdfs_dst)
    return self._call('-put', source, hdfs_dst)

  def exists(self, path, flag='-e'):
    """Returns True if the path exists in HDFS, False otherwise."""
    try:
      return self._call('-test', flag, path) == 0
    except subprocess.CalledProcessError:
      return False

  def cat(self, remote_file_pattern, local_file=sys.stdout):
    """Cats an HDFS file to a local file."""
    return self._call('-cat', remote_file_pattern, also_output_to_file=local_file)

  def _ls(self, path, is_dir=False, is_recursive=False):
    """Returns a list of [hdfs_full_path, filesize] pairs.

    Raises InternalError when the HDFS ls command returns an error.
    """
    hdfs_cmd = '-lsr' if is_recursive else '-ls'
    (exit_code, ls_result) = self._call(hdfs_cmd, path, return_output=True)
    if exit_code != 0:
      raise self.InternalError('Error occurred: %s. Check logs for details' % ls_result)
    file_list = []
    if ls_result is None:
      return file_list
    for line in ls_result.splitlines():
      if line == '' or line.startswith('Found'):
        continue
      seg = line.split(None, 7)
      if len(seg) < 8:
        raise self.InternalError('Invalid hdfs -ls output. [%s]' % line)
      filename = seg[-1]
      try:
        metadata = self.PARSER.parse(' '.join(seg[0:7]))
      except ScanfParser.ParseError as e:
        raise self.InternalError('Unable to parse hdfs output: %s' % e)
      # seg[0] example: drwxrwx---
      if metadata.mode.startswith('d') != is_dir:
        continue
      file_list.append([filename, metadata.filesize])
    return file_list

  def ls(self, path, is_dir=False):
    """Returns a list of [hdfs_full_path, filesize] pairs.

    If is_dir is True, returns only the top-level directories.
    """
    return self._ls(path, is_dir, False)

  def lsr(self, path, is_dir=False):
    """Recursively returns a list of [hdfs_full_path, filesize] pairs.

    If is_dir is True, returns only the directories.
    """
    return self._ls(path, is_dir, True)

  def read(self, filename):
    """Returns the contents of filename, or None if an error occurred."""
    with temporary_file() as fp:
      os.unlink(fp.name)  # Free the name so -copyToLocal can create the file.
      if self._call('-copyToLocal', filename, fp.name) == 0:
        with open(fp.name) as f:
          return f.read()
      else:
        return None

  def write(self, filename, text):
    """Writes text to the given filename.

    The file will be overwritten if it already exists.
    """
    self._call('-rm', filename)
    with temporary_file() as fp:
      fp.write(text)
      fp.flush()
      return self._call('-copyFromLocal', fp.name, filename)

  def mkdir(self, path):
    """Creates a directory; returns an error if it is already present."""
    return self._call('-mkdir', path)

  def mkdir_suppress_err(self, path):
    """Creates a directory if it does not exist."""
    if not self.exists(path):
      return self.mkdir(path)

  def rm(self, filename):
    """Removes a file."""
    return self._call('-rm', filename, suppress_output=True)

  def cp(self, src, dest):
    """Copies a src file to dest."""
    return self._call('-cp', src, dest, suppress_output=True)

  def mv(self, src, dest):
    """Moves a src file to dest."""
    return self._call('-mv', src, dest, suppress_output=True)

  def copy_from_local(self, local, remote):
    """Copies a file from local to remote."""
    return self._call('-copyFromLocal', local, remote, suppress_output=True)

  def copy_to_local(self, remote, local):
    """Copies a file from remote to local."""
    return self._call('-copyToLocal', remote, local, suppress_output=True)
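# Example (sketch): a typical HDFSHelper round trip. The config directory and
# HDFS paths below are hypothetical; Amount and Data come from
# twitter.common.quantity, as in the constructor default above.
hdfs = HDFSHelper('/etc/hadoop/conf', heap_limit=Amount(512, Data.MB))
hdfs.mkdir_suppress_err('/user/alice/data')
hdfs.put('~/report.csv', '/user/alice/data/report.csv')
for path, size in hdfs.lsr('/user/alice/data'):
  print('%s (%d bytes)' % (path, size))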
class ExecutorDetector(object):
  class Error(Exception): pass
  class CannotFindRoot(Error): pass

  LOG_PATH = 'executor_logs'
  RESOURCE_PATH = 'resource_usage.recordio'
  VARS_PATH = 'executor_vars.json'
  PATTERN = [
      '%(root)s',
      'slaves', '%(slave_id)s',
      'frameworks', '%(framework_id)s',
      'executors', '%(executor_id)s',
      'runs', '%(run)s'
  ]
  EXTRACTOR = ScanfParser(os.path.join(*PATTERN))

  @classmethod
  def find_root(cls, path):
    """Does this path appear to match the executor directory pattern?"""

    def root_from_path(path):
      path = os.path.normpath(path)
      path_vector = path.split(os.path.sep)
      pattern_vector = cls.PATTERN
      if len(path_vector) < len(pattern_vector):
        return None
      # Compare from the right: literal pattern components must match exactly,
      # while '%(...)s' wildcards match any path component.
      for pattern, path_component in zip(reversed(pattern_vector), reversed(path_vector)):
        if pattern.startswith('%'):
          continue
        if path_component != pattern:
          return None
      matched_path = os.path.join(*path_vector[-len(pattern_vector) + 1:])
      return os.path.normpath(path[:-len(matched_path)])

    # Walk upward until a directory matching the pattern is found.
    while path != os.path.dirname(path):
      root = root_from_path(path)
      if root:
        return root
      path = os.path.dirname(path)

  @classmethod
  def match(cls, path):
    try:
      return cls.EXTRACTOR.parse(path)
    except ScanfParser.ParseError:
      return None

  @classmethod
  def path(cls, result):
    return os.path.join(*cls.PATTERN) % result.groups()

  @classmethod
  def find(cls, root, slave_id='*', framework_id='*', executor_id='*', run='*'):
    mixins = dict(
        root=root,
        slave_id=slave_id,
        framework_id=framework_id,
        executor_id=executor_id,
        run=run)
    return filter(None, map(cls.match, glob(os.path.join(*cls.PATTERN) % mixins)))

  def __init__(self, root=None):
    self.root = root or self.find_root(os.getcwd())
    if self.root is None:
      raise self.CannotFindRoot('Not a valid executor root!')

  def __iter__(self):
    for extraction in self.find(root=self.root):
      yield extraction
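# Example (sketch): enumerating executor run directories under a hypothetical
# Mesos slave root. Each extraction is a ScanfParser result whose groups()
# carry root, slave_id, framework_id, executor_id, and run.
detector = ExecutorDetector(root='/var/lib/mesos')
for extraction in detector:
  print(ExecutorDetector.path(extraction))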
class HDFSHelper(object):
  """Provides a set of functions for hadoop operations."""

  class InternalError(Exception): pass

  PARSER = ScanfParser(
      '%(mode)s %(dirents)s %(user)s %(group)s %(filesize)d '
      '%(year)d-%(month)d-%(day)d %(hour)d:%(minute)d')

  def __init__(self, config, command_class=CommandUtil):
    # Point to a test hadoop cluster if no config is given.
    self._config = config
    self._cmd_class = command_class

  @property
  def config(self):
    return self._config

  def _call(self, cmd, *args, **kwargs):
    """Runs a hadoop fs command with the given command and args.

    Checks the result of the call by default, but this can be disabled
    with check=False.
    """
    cmd = ['hadoop', '--config', self._config, 'dfs', cmd] + list(args)
    if kwargs.get('check'):
      return self._cmd_class.check_call(cmd)
    elif kwargs.get('return_output'):
      return self._cmd_class.execute_and_get_output(cmd)
    elif kwargs.get('suppress_output'):
      return self._cmd_class.execute_suppress_stdout(cmd)
    else:
      return self._cmd_class.execute(cmd)

  def get(self, src, dst):
    """Copies file(s) in HDFS to a local path (via proxy if necessary).

    NOTE: If src matches multiple files, make sure dst is a directory!
    """
    hdfs_src = ' '.join(src) if isinstance(src, list) else src
    return self._call('-get', hdfs_src, dst)

  def put(self, src, dst):
    """Copies the local file src to the hadoop path dst."""
    abs_src = os.path.expanduser(src)
    assert os.path.exists(abs_src), 'File does not exist, cannot copy: %s' % abs_src
    return self._do_put(abs_src, dst)

  def _do_put(self, source, dst):
    """Puts the local file into HDFS, replacing any existing file."""
    hdfs_dst = ' '.join(dst) if isinstance(dst, list) else dst
    # An exit code of 0 from -test -e means the destination already exists.
    if not self._call('-test', '-e', hdfs_dst, check=False):
      self._call('-rm', '-skipTrash', hdfs_dst)
    return self._call('-put', source, hdfs_dst)

  def exists(self, path, flag='-e'):
    """Returns True if the path exists in HDFS, False otherwise."""
    try:
      return self._call('-test', flag, path) == 0
    except subprocess.CalledProcessError:
      return False

  def cat(self, remote_file_pattern, local_file=sys.stdout):
    """Cats an HDFS file to a local file."""
    return self._call('-cat', remote_file_pattern, also_output_to_file=local_file)

  def _ls(self, path, is_dir=False, is_recursive=False):
    """Returns a list of [hdfs_full_path, filesize] pairs.

    Raises InternalError when the hadoop ls command returns an error.
    """
    hdfs_cmd = '-lsr' if is_recursive else '-ls'
    (exit_code, ls_result) = self._call(hdfs_cmd, path, return_output=True)
    if exit_code != 0:
      raise self.InternalError('Error occurred: %s. Check logs for details' % ls_result)
    file_list = []
    if ls_result is None:
      return file_list
    for line in ls_result.splitlines():
      if line == '' or line.startswith('Found'):
        continue
      seg = line.split(None, 7)
      if len(seg) < 8:
        raise self.InternalError('Invalid hdfs -ls output. [%s]' % line)
      filename = seg[-1]
      try:
        metadata = self.PARSER.parse(' '.join(seg[0:7]))
      except ScanfParser.ParseError as e:
        raise self.InternalError('Unable to parse hdfs output: %s' % e)
      # seg[0] example: drwxrwx---
      if metadata.mode.startswith('d') != is_dir:
        continue
      file_list.append([filename, metadata.filesize])
    return file_list

  def ls(self, path, is_dir=False):
    """Returns a list of [hdfs_full_path, filesize] pairs.

    If is_dir is True, returns only the top-level directories.
    """
    return self._ls(path, is_dir, False)

  def lsr(self, path, is_dir=False):
    """Recursively returns a list of [hdfs_full_path, filesize] pairs.

    If is_dir is True, returns only the directories.
    """
    return self._ls(path, is_dir, True)

  def read(self, filename):
    """Returns the contents of the file, or None if the copy failed."""
    tmp_file = tempfile.mktemp()
    if self._call('-copyToLocal', filename, tmp_file) == 0:
      with open(tmp_file, 'r') as f:
        text = f.read()
    else:
      text = None
    return text

  def write(self, filename, text):
    """Writes text to the given filename.

    The file will be overwritten if it already exists.
    """
    self._call('-rm', filename)
    with temporary_file() as fp:
      fp.write(text)
      fp.flush()
      return self._call('-copyFromLocal', fp.name, filename)

  def mkdir(self, path):
    """Creates a directory; returns an error if it is already present."""
    return self._call('-mkdir', path)

  def mkdir_suppress_err(self, path):
    """Creates a directory if it does not exist."""
    if not self.exists(path):
      return self.mkdir(path)

  def rm(self, filename):
    """Removes a file."""
    return self._call('-rm', filename, suppress_output=True)

  def cp(self, src, dest):
    """Copies a src file to dest."""
    return self._call('-cp', src, dest, suppress_output=True)

  def copy_from_local(self, local, remote):
    """Copies a file from local to remote."""
    return self._call('-copyFromLocal', local, remote, suppress_output=True)

  def copy_to_local(self, remote, local):
    """Copies a file from remote to local."""
    return self._call('-copyToLocal', remote, local, suppress_output=True)