Example #1
def chown(self):
    new_user = '******'
    test_path = self.hdfs_paths[0]
    hdfs.dump(self.data, test_path, mode="wb")
    hdfs.chown(test_path, user=new_user)
    path_info = hdfs.lsl(test_path)[0]
    self.assertEqual(path_info['owner'], new_user)
    prev_owner = path_info['owner']
    prev_grp = path_info['group']
    # owner and group should remain unchanged
    hdfs.chown(test_path, user='', group='')
    path_info = hdfs.lsl(test_path)[0]
    self.assertEqual(path_info['owner'], prev_owner)
    self.assertEqual(path_info['group'], prev_grp)
Example #2
def chown(self):
    new_user = '******'
    test_path = self.hdfs_paths[0]
    hdfs.dump(self.data, test_path)
    hdfs.chown(test_path, user=new_user)
    path_info = hdfs.lsl(test_path)[0]
    self.assertEqual(path_info['owner'], new_user)
    prev_owner = path_info['owner']
    prev_grp = path_info['group']
    # owner and group should remain unchanged
    hdfs.chown(test_path, user='', group='')
    path_info = hdfs.lsl(test_path)[0]
    self.assertEqual(path_info['owner'], prev_owner)
    self.assertEqual(path_info['group'], prev_grp)
Example #3
def initialize_break_points(cls, n_reducers, sampled_records,
                            input_dir, n_threads=2):
    # collect the 'part*' data files under input_dir
    file_infos = [
        i for i in hdfs.lsl(input_dir)
        if (i['kind'] == 'file'
            and os.path.basename(i['name']).startswith('part'))
    ]
    n_files = len(file_infos)
    total_size = sum(int(i['size']) for i in file_infos)
    n_records = total_size // RECORD_LENGTH
    assert n_records > sampled_records
    # sample every df-th file so that at most n_reducers files are read
    df = max(n_files // n_reducers, 1)
    paths = [
        i['name'] for i in it.islice(file_infos, 0, df * n_reducers, df)
    ]
    break_points = cls.get_break_points(sampled_records // n_reducers,
                                        n_reducers, paths, n_threads)
    # pair each break point with a reducer index (1..n_reducers-1)
    vals = list(zip(break_points, range(1, n_reducers)))
    selector = Selector(vals)
    bp_path = os.path.join(cls.TMP_DIR, cls.BREAK_POINTS_CACHE_FILE)
    with io.open(bp_path, "wb") as f:
        f.write(srl.private_encode(selector))
    return bp_path
Example #4
def list_images(input_dir):
    ret = []
    p = re.compile(r".*\.jpe?g$", re.IGNORECASE)
    ls = [_['name'] for _ in hdfs.lsl(input_dir) if _['kind'] == 'directory']
    for d in ls:
        ret.extend([_ for _ in hdfs.ls(d) if p.match(_)])
    LOGGER.info("%d classes, %d total images", len(ls), len(ret))
    return ret
Example #5
def _get_samples_from_bcl_output(output_dir):
    def name_filter(n):
        bn = os.path.basename(n).lower()
        return not (bn.startswith('_') or ('unknown' in bn) or
                    ('undetermined' in bn))

    return [
        d['name'] for d in phdfs.lsl(output_dir)
        if d['kind'] == 'directory' and name_filter(d['name'])
    ]
Example #6
def list_images(input_dir):
    rval = []
    logging.info("scanning %s", input_dir)
    for entry in hdfs.lsl(input_dir):
        if all(
            (entry["kind"] != "directory", not entry["name"].startswith("_"),
             entry["name"].endswith(".png"))):
            rval.append(entry["name"])
    logging.info("found %d images", len(rval))
    return rval
Example #7
def _get_samples_from_bcl_output(output_dir):
    def name_filter(n):
        bn = os.path.basename(n).lower()
        return not (bn.startswith('_') or ('unknown' in bn) or
                    ('undetermined' in bn))

    return [
        d['name'] for d in phdfs.lsl(output_dir)
        if d['kind'] == 'directory' and name_filter(d['name'])
    ]
Example #8
    def __init__(self, file_prefix, loadexist=False, readonly=False):
        CustomStorage.__init__(self)
        if not loadexist:
            if hdfs.path.exists('{0}_0'.format(file_prefix)):
                file_prefix += '_0'
            while hdfs.path.exists('{0}_0'.format(file_prefix)):
                insert_index = file_prefix.rfind('_')
                file_prefix = '{0}_{1}'.format(file_prefix[:insert_index], int(file_prefix[insert_index + 1:]) + 1)
        self.file_prefix = file_prefix
        self.read_only = readonly
        self.clear()
        logger.info('init hdfs storage from hdfs file_prefix {0}'.format(self.file_prefix))
        try:
            total_start = timeit.default_timer()
            prefix_split = hdfs.path.splitpath(self.file_prefix)
            folder_path = prefix_split[0]
            real_prefix = prefix_split[1] + '_'
            if not hdfs.path.exists(folder_path):
                hdfs.mkdir(folder_path)

            files_info = hdfs.lsl(folder_path)
            # files_info = hdfs.lsl('{0}_*'.format(self.file_prefix))
            logger.debug('files_info:{0}'.format(files_info))
            sizecount = 0
            for file_info in files_info:
                start_time = timeit.default_timer()
                file_name = hdfs.path.splitpath(file_info['path'])[1]
                if file_name.startswith(real_prefix) and file_info['kind'] == 'file':
                    logger.debug('file info: {0}'.format(file_info))
                    page_id = file_name[len(real_prefix):]
                    if not page_id.isdigit():
                        continue
                    logger.debug('file {0} page id :{1}#'.format(file_info['path'],
                                                                 page_id))
                    # if page_id.isdigit():
                    logger.info('load {0}# page file {1}'.format(page_id,
                                                                 file_info['path']))
                    content = hdfs.load(file_info['path'], mode='r')
                    # logger.debug('{0}# page content:{1}'.format(page_id, content))
                    self.pagedict[int(page_id)] = content
                    logger.debug('{0}# page load complete'.format(page_id))
                    end_time = timeit.default_timer()
                    eval(generate_timer_log_str.format(
                        'load {0} {1} byte'.format(file_name, len(self.pagedict[int(page_id)])),
                        start_time,
                        end_time))
                    sizecount += len(self.pagedict[int(page_id)])
        except IOError:
            logger.debug(traceback.format_exc())
Example #9
def upsert_a_folder(src_dir, hdfs_tgt_dir, filename, debug):
    src_fname = os.path.join(src_dir, filename)
    tgt_fname = os.path.join(hdfs_tgt_dir, filename)
    # get target file info
    tgt_dict = {}
    try:
        lsl = hdfs.lsl(hdfs_tgt_dir)
        for i in lsl:
            try:
                tgt_dict[os.path.basename(i["name"])] = i["last_mod"]
            except Exception:
                pass
    except Exception:
        pass
    print("hdfs tgt_dict=", tgt_dict)

    # get source info
    src_fs = glob.glob(src_fname)
    print("src_fs=", src_fs)
    for sf in src_fs:
        # get source file info
        try:
            src_ctime_int = int(os.path.getctime(sf))
        except OSError:
            src_ctime_int = None
        print("src_ctime_int=", src_ctime_int)

        src_bfname = os.path.basename(sf)
        tgt_fname = os.path.join(hdfs_tgt_dir, src_bfname)
        # put or rm/put
        try:
            if src_bfname not in tgt_dict:
                # insert new one
                if debug == 'N':
                    hdfs.put(sf, hdfs_tgt_dir)
                else:
                    print("DEBUG: put ", src_bfname, "to", hdfs_tgt_dir)
            elif src_ctime_int > tgt_dict[src_bfname]:
                if debug == 'N':
                    hdfs.rmr(tgt_fname)
                    hdfs.put(sf, hdfs_tgt_dir)
                else:
                    print("DEBUG: replace ", tgt_fname, "by", sf)
            else:
                print(tgt_fname, "has a newer mdate than", sf, ":", src_ctime_int)
        except Exception as e:
            print("Error: ", e)
Example #10
def lsl(hdfs_path, recursive=False, project=None):
    """
    Returns all the pathnames in the supplied directory.

    Args:
        :hdfs_path: You can specify either a full hdfs pathname or a relative one (relative to project_name in HDFS).
        :recursive: if it is a directory and recursive is True, the list contains one item for every file or directory in the tree rooted at hdfs_path.
        :project: If the supplied hdfs_path is a relative path, it will look for that file in this project's subdir in HDFS.

    Returns:
        A possibly-empty list of path names stored in the supplied path.
    """
    if project is None:
        project = project_name()
    hdfs_path = _expand_path(hdfs_path, project)
    return hdfs.lsl(hdfs_path, recursive=recursive)
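The wrapper above resolves a relative path against a project directory before delegating to pydoop's hdfs.lsl. A minimal usage sketch, with a made-up path and project name (not real cluster state):

# Hypothetical call: list a project-relative directory recursively.
# "Resources/logs" and "demo" are placeholder names.
for info in lsl("Resources/logs", recursive=True, project="demo"):
    print(info["kind"], info["name"], info["size"])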
Example #11
def clean_directory(dir_path, spam_life=spam_ttl):
    # Accepts a directory name and deletes anything older than TTL in days
    file_list = []

    # check the existence of the directory
    if hdfs.path.exists(dir_path):
        # get a list of all files there
        file_list = hdfs.lsl(dir_path)

    # loop through the file list
    for listing in file_list:
        # get the last access time of the file and compare to spam lifetime
        if time.time() - listing['last_access'] > 86400 * spam_life:  # 86400 seconds in a day
            # if it's too old, delete it and log that it was deleted
            logger.info('Deleting ' + listing['name'])
            hdfs.rmr(listing['name'])
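Because hdfs.lsl already returns stat dictionaries with a 'last_access' field, the TTL check above needs no extra metadata calls. A minimal invocation sketch; the path and TTL are placeholders:

# Hypothetical: purge entries not accessed in the last 7 days.
clean_directory('/user/spam/quarantine', spam_life=7)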
Example #12
def initialize_break_points(cls, n_reducers, sampled_records,
                            input_dir, n_threads=2):
    file_infos = [i for i in hdfs.lsl(input_dir)
                  if (i['kind'] == 'file' and
                      os.path.basename(i['name']).startswith('part'))]
    n_files = len(file_infos)
    total_size = sum(int(i['size']) for i in file_infos)
    n_records = total_size // RECORD_LENGTH
    assert n_records > sampled_records
    df = max(n_files // n_reducers, 1)
    paths = [i['name']
             for i in it.islice(file_infos, 0, df * n_reducers, df)]
    break_points = cls.get_break_points(sampled_records // n_reducers,
                                        n_reducers, paths, n_threads)
    vals = list(zip(break_points, range(1, n_reducers)))
    selector = Selector(vals)
    bp_path = os.path.join(cls.TMP_DIR, cls.BREAK_POINTS_CACHE_FILE)
    with io.open(bp_path, "wb") as f:
        f.write(srl.private_encode(selector))
    return bp_path
Example #13
def walk_remotely(remote_path):
    LOGGER.debug("Walking {0}".format(remote_path))
    inodes = hdfs.lsl(remote_path, recursive=True)
    return inodes
Example #14
def assign_labels(top_dir):
    classes = [
        hdfs.path.basename(_["name"]) for _ in hdfs.lsl(top_dir)
        if _["kind"] == "directory"
    ]
    return {c: i for i, c in enumerate(sorted(classes))}
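Since the class names are sorted before enumeration, the label ids are stable across runs. A usage sketch with a made-up directory layout:

# Hypothetical: with subdirectories 'cats' and 'dogs' under top_dir,
# assign_labels returns {'cats': 0, 'dogs': 1}.
labels = assign_labels("/datasets/pets")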
Example #15
def walk_remotely(remote_path):
    LOGGER.debug("Walking {0}".format(remote_path))
    inodes = hdfs.lsl(remote_path, recursive=True)
    return inodes