# Example #1
# 0
 def run(self):
     """Scan every path in self.path_list on HDFS and report files with
     replication factor 1.

     Recursively lists each path via a snakebite AutoConfigClient and
     prints the path of every file whose replication factor is 1 to
     stdout. If self.replication_factor is set, also resets those files
     to that factor via setrep. Progress dots, errors and the final
     summary go to stderr so stdout stays a clean list of paths.
     Exits with status 2 if the HDFS client cannot be initialised.
     """
     log.info('initiating snakebite hdfs client')
     try:
         client = AutoConfigClient()
     except krbV.Krb5Error as _:  # pylint: disable=no-member
         if self.verbose:
             print('', file=sys.stderr)
         print(_, file=sys.stderr)
         # BUG FIX: bail out here — otherwise 'client' is unbound below
         # and the listing loop dies with a NameError instead of a
         # clean exit after the Kerberos error was already reported
         sys.exit(2)
     start_time = time.time()
     dir_count = 0
     file_count = 0
     repl1_count = 0
     for path in self.path_list:
         try:
             result_list = client.ls([path],
                                     recurse=True,
                                     include_toplevel=True,
                                     include_children=True)
             for result in result_list:
                 # progress indicator: one dot per 100 entries scanned
                 if self.verbose and (dir_count + file_count) % 100 == 0:
                     print('.', file=sys.stderr, end='')
                 # directories report block_replication == 0
                 if result['block_replication'] == 0:
                     dir_count += 1
                     continue
                 file_count += 1
                 if result['block_replication'] == 1:
                     file_path = result['path']
                     repl1_count += 1
                     if self.verbose:
                         print('', file=sys.stderr)
                     print(file_path)
                     if self.replication_factor:
                         log.info('setting replication factor to %s on %s',
                                  self.replication_factor, file_path)
                         # setrep returns a generator so must evaluate in
                         # order to actually execute, otherwise there is
                         # no effect on the replication factor
                         for _ in client.setrep([file_path],
                                                self.replication_factor,
                                                recurse=False):
                             if 'result' not in _:
                                 print(
                                     'WARNING: result field not found in setrep result: {}'
                                     .format(_),
                                     file=sys.stderr)
                                 continue
                             if not _['result']:
                                 # BUG FIX: warning belongs on stderr like
                                 # the one above, not mixed into the
                                 # stdout path listing
                                 print(
                                     'WARNING: failed to setrep: {}'.format(_),
                                     file=sys.stderr)
         except (snakebite.errors.FileNotFoundException,
                 snakebite.errors.RequestError) as _:
             if self.verbose:
                 print('', file=sys.stderr)
             print(_, file=sys.stderr)
     if self.verbose:
         print('', file=sys.stderr)
     secs = int(time.time() - start_time)
     print('\nCompleted in {} secs\n'.format(secs), file=sys.stderr)
     print('{} files with replication factor 1 out of {} files in {} dirs'\
           .format(repl1_count, file_count, dir_count), file=sys.stderr)
# Example #2
# 0
class HdfsFileManager(FileManagerBase):
    """File manager backed by a snakebite HDFS client."""

    def can_handle(self, path):
        # Only HDFS URIs are handled by this manager.
        return path.startswith('hdfs://')

    def __init__(self):
        self._client = AutoConfigClient()

    def ls(self, path: str, recursive=False) -> List[File]:
        """List regular files under *path* as File(path, size) records."""
        entries = self._client.ls([path], recurse=recursive)
        return [
            File(path=entry['path'], size=entry['length'])
            for entry in entries
            if entry['file_type'] == 'f'
        ]

    def move(self, source: str, destination: str) -> bool:
        """Rename *source* to *destination*; True if anything was renamed."""
        renamed = list(self._client.rename([source], destination))
        return len(renamed) > 0

    def remove(self, path: str) -> bool:
        """Delete *path*; True if anything was deleted."""
        deleted = list(self._client.delete([path]))
        return len(deleted) > 0

    def copy(self, source: str, destination: str) -> bool:
        # TODO
        raise NotImplementedError()

    def mkdir(self, path: str) -> bool:
        """Create *path* (with missing parents); True on success."""
        outcome = next(self._client.mkdir([path], create_parent=True))
        return outcome.get('result')
def main(hadoop_conf_dir="/media/d2/code-sky/dockers/hadoop/etc/hadoop",
         target_hdfs_path="/"):
    """List the contents of an HDFS path and print each entry.

    Parameters:
    hadoop_conf_dir (str): directory holding the Hadoop client config;
        exported as HADOOP_CONF_DIR so AutoConfigClient can locate the
        namenode settings. Defaults to the original hard-coded path so
        existing zero-argument callers are unaffected.
    target_hdfs_path (str): HDFS path to list (default: root).
    """
    os.environ['HADOOP_CONF_DIR'] = hadoop_conf_dir

    cli = AutoConfigClient()

    for element in cli.ls([target_hdfs_path]):
        print("Result: " + str(element))
def ls(hdfs_path, recurse=False, include_toplevel=True, include_children=False):
    """
    Parameters:
    hdfs_path (str) : Path to list
    recurse (boolean) : Recursive listing
    include_toplevel (boolean) : Include the given path in the listing. If the path is a file, include_toplevel is always True.
    include_children (boolean) : Include child nodes in the listing.
    Returns:
    (LsObject) path listings with attributes
    """
    hdfs_client = AutoConfigClient()

    listing = hdfs_client.ls([hdfs_path], recurse, include_toplevel, include_children)

    return LsObject(list(listing))
# Example #5
# 0
class HdfsFileManager(FileManagerBase):
    """File manager implementation backed by a snakebite HDFS client."""
    def can_handle(self, path):
        # This manager only understands HDFS URIs.
        return path.startswith('hdfs://')

    def __init__(self):
        self._client = AutoConfigClient()

    def ls(self, path: str, recursive=False) -> List[str]:
        """Return the paths of regular files found under *path*."""
        return [
            entry['path']
            for entry in self._client.ls([path], recurse=recursive)
            if entry['file_type'] == 'f'
        ]

    def move(self, source: str, destination: str) -> bool:
        """Rename *source* to *destination*; True if anything was renamed."""
        moved = list(self._client.rename([source], destination))
        return len(moved) > 0

    def remove(self, path: str) -> bool:
        """Delete *path*; True if anything was deleted."""
        deleted = list(self._client.delete([path]))
        return len(deleted) > 0
    # don't nuke this; hbase uses it for bulk loading.
    re.compile("^/tmp/hbase-staging/?"),

    # let's try to make sure we're not matching against a top-level path
    re.compile("^/[-_.a-zA-Z0-9]+/?$"),

    re.compile("cloudera_health_monitoring_canary_files"),

    # let's bail out explicitly on anything in our data path
    re.compile("^/data/production/?"),
]


# Walk args.path and delete entries older than `older_than`, skipping
# anything matched by the donotdelete_whitelist patterns above.
if client.test(args.path, exists=True):
    for x in client.ls([args.path], recurse=args.recurse_filesystem):
        if any(regex.search(x['path']) for regex in donotdelete_whitelist):
            logger.info("Matched banned thing, not attempting to delete it: %s", x['path'])
        else:
            # HDFS reports modification_time in milliseconds since the epoch.
            f_timestamp = datetime.datetime.fromtimestamp(x['modification_time']/1000)
            if f_timestamp < older_than:
                logger.info("I might delete this: %s %s", x['path'], f_timestamp)
                if args.actually_delete:
                    logger.info("Issuing delete of %s", list(client.delete([x['path']], recurse=True)))
                    # BUG FIX: deletion succeeded when the path no longer
                    # exists; the original logged "Removed" only when the
                    # path was STILL present (missing `not`).
                    if not client.test(x['path'], exists=True):
                        logger.info("Removed %s", x['path'])
                else:
                    logger.info("I would have deleted this: %s ", x['path'])
else:
    # logger.warn() is a deprecated alias; use warning() instead.
    logger.warning("%s is not found on hdfs", args.path)