def write(idir, odir, remove, check, verbose):
    """Write Avro files from a local input area into HDFS.

    Scans ``idir`` for ``.avro``/``.avro.gz``/``.avro.bz2`` files and copies
    each one to ``odir`` on HDFS unless it is already there.

    Args:
        idir: local source directory (must exist, else exit(1)).
        odir: destination directory on HDFS (must exist, else exit(1)).
        remove: if true, delete the local file after a successful migration.
        check: if true, compare local and HDFS file sizes after the put and
            exit(1) on mismatch.
        verbose: if true, print per-file progress messages.

    Side effects: prints status messages; may call sys.exit(1); writes to
    HDFS; may delete local files.
    """
    if not os.path.isdir(idir):
        print("Source area %s does not exists" % idir)
        sys.exit(1)
    if not hdfs.path.isdir(odir):
        print("Destination area on HDFS %s does not exists" % odir)
        print("Create it first with the following command")
        print("hadoop fs -mkdir %s" % odir)
        sys.exit(1)
    # str.endswith accepts a tuple of suffixes: one call replaces the or-chain
    avro_suffixes = ('.avro', '.avro.gz', '.avro.bz2')
    for name in os.listdir(idir):
        fname = os.path.join(idir, name)
        if not name.endswith(avro_suffixes):
            if verbose:
                print("Skip %s" % fname)
            continue
        oname = hdfs_file(odir, name)
        if not hdfs.path.isfile(oname):
            if verbose:
                print("Migrate %s to %s" % (fname, oname))
            hdfs.put(fname, oname)
            if check:
                fsize = os.stat(fname).st_size
                osize = hdfs.stat(oname).st_size
                if fsize != osize:
                    print("Size %s (%s) != %s (%s)" % (fname, fsize, oname, osize))
                    sys.exit(1)
            if remove:
                os.remove(fname)
def _create_directories(app_id, run_id, param_string, type, sub_type=None):
    """
    Creates directories for an experiment, if Experiments folder exists it will create directories
    below it, otherwise it will create them in the Logs directory.

    Args:
        :app_id: YARN application ID of the experiment
        :run_id: Experiment ID
        :param_string: name of the new directory created under parent directories
        :type: type of the new directory parent, e.g differential_evolution
        :sub_type: type of sub directory to parent, e.g generation

    Returns:
        The new directories for the yarn-application and for the execution
        (hdfs_exec_logdir, hdfs_appid_logdir)
    """
    # Project-level HDFS handle (project helper; returned object exposes
    # exists/delete/create_directory).
    pyhdfs_handle = get()
    # Prefer the Experiments folder; fall back to Logs/TensorFlow.
    # NOTE(review): if neither "Experiments" nor "Logs" exists,
    # hdfs_events_parent_dir is never bound and the hdfs.stat() below raises
    # NameError — confirm at least one folder is guaranteed by the project setup.
    if pyhdfs_handle.exists(project_path() + "Experiments"):
        hdfs_events_parent_dir = project_path() + "Experiments"
    elif pyhdfs_handle.exists(project_path() + "Logs"):
        hdfs_events_parent_dir = project_path() + "Logs/TensorFlow"
    try:
        st = hdfs.stat(hdfs_events_parent_dir)
        if not bool(st.st_mode & local_stat.S_IWGRP):  # if not group writable make it so
            hdfs.chmod(hdfs_events_parent_dir, "g+w")
    except IOError:
        # If this happens then the permission is set correct already since the
        # creator of the /Logs/TensorFlow already set group writable
        pass

    # One directory per YARN application...
    hdfs_appid_logdir = hdfs_events_parent_dir + "/" + app_id
    # if not pyhdfs_handle.exists(hdfs_appid_logdir):
    #     pyhdfs_handle.create_directory(hdfs_appid_logdir)

    # ...then one per experiment run below it. NOTE(review): the parameter
    # name `type` shadows the builtin; kept as-is since callers may pass it
    # by keyword.
    hdfs_run_id_logdir = hdfs_appid_logdir + "/" + type + "/run." + str(run_id)

    # determine directory structure based on arguments
    if sub_type:
        hdfs_exec_logdir = hdfs_run_id_logdir + "/" + str(sub_type) + '/' + str(param_string)
    elif not param_string and not sub_type:
        hdfs_exec_logdir = hdfs_run_id_logdir + '/'
    else:
        hdfs_exec_logdir = hdfs_run_id_logdir + '/' + str(param_string)

    # Need to remove directory if it exists (might be a task retry)
    if pyhdfs_handle.exists(hdfs_exec_logdir):
        pyhdfs_handle.delete(hdfs_exec_logdir, recursive=True)

    # create the new directory
    pyhdfs_handle.create_directory(hdfs_exec_logdir)

    # update logfile: downstream code reads EXEC_LOGFILE from the environment
    logfile = hdfs_exec_logdir + '/' + 'logfile'
    os.environ['EXEC_LOGFILE'] = logfile
    return hdfs_exec_logdir, hdfs_appid_logdir
def stat(hdfs_path):
    """
    Performs the equivalent of os.stat() on hdfs_path, returning a StatResult object.

    Args:
        :hdfs_path: You can specify either a full hdfs pathname or a relative one
            (relative to your Project's path in HDFS).

    Returns:
        StatResult object for the given path.
    """
    # Normalize relative project paths to absolute HDFS paths first.
    hdfs_path = _expand_path(hdfs_path)
    return hdfs.stat(hdfs_path)
def stat(hdfs_path, project=None):
    """
    Performs the equivalent of os.stat() on path, returning a StatResult object.

    Args:
        :hdfs_path: You can specify either a full hdfs pathname or a relative one
            (relative to your Project's path in HDFS).
        :project: If this value is not specified, it will get the path to your
            project. If you need the path to another project, you can specify
            the name of the project as a string.

    Returns:
        StatResult object
    """
    # Idiom: compare to None with `is`, not `==`.
    if project is None:
        project = project_name()
    hdfs_path = _expand_path(hdfs_path, project)
    return hdfs.stat(hdfs_path)
def chown(self, path, uid, gid):
    '''Change owner and/or group of path; a uid or gid of -1 means "leave unchanged".'''
    # Stat first, as the original did (acts as an existence probe on path).
    st = hdfs.stat(path)
    # Accumulate only the identities that actually change; -1 is the
    # chown(2) convention for "keep current".
    changes = {}
    if uid != -1:
        changes['user'] = self._U2h_u(uid)
    if gid != -1:
        changes['group'] = self._G2h_g(gid)
    # assumes hdfs.chown's signature is (path, user=None, group=None) so the
    # keyword form matches the original positional calls — TODO confirm
    hdfs.chown(path, **changes)
def getattr(self, path, fh=None):
    """FUSE getattr: translate an HDFS stat result into the attr dict FUSE expects."""
    if not hdfs.path.exists(path):
        raise FuseOSError(errno.ENOENT)
    st = hdfs.stat(path)
    # 16384 (0o040000) and 32768 (0o100000) are the S_IFDIR / S_IFREG
    # file-type bits; they are added on top of whatever st_mode reports.
    mode = st.st_mode
    kind = st.kind.lower()
    if kind == 'directory':
        mode = mode + 16384
    if kind == 'file':
        mode = mode + 32768
    return {
        'st_atime': st.st_atime,
        'st_ctime': st.st_ctime,
        'st_gid': self._h_g2G(st.st_gid),
        'st_mode': mode,
        'st_mtime': st.st_mtime,
        'st_nlink': st.st_nlink,
        'st_size': st.st_size,
        'st_uid': self._h_u2U(st.st_uid),
    }
def upsert_a_file(src_dir, hdfs_tgt_dir, filename, debug): src_fname = os.path.join(src_dir, filename) tgt_fname = os.path.join(hdfs_tgt_dir, filename) # get source file info try: src_ctime_int = int(os.path.getctime(src_fname)) except: src_ctime_int = None print "src_ctime_int=", src_ctime_int # get target file info try: tgt_stat = hdfs.stat(tgt_fname) tgt_mtime = tgt_stat.st_mtime except: tgt_mtime = None print "tgt_mtime=", tgt_mtime # put or rm/put try: if tgt_mtime is None: #insert new one if debug == 'N': hdfs.put(src_fname, hdfs_tgt_dir) else: print "DEBUG: put ", src_fname, "to", hdfs_tgt_dir elif src_ctime_int > tgt_mtime: if debug == 'N': hdfs.rmr(tgt_fname) hdfs.put(src_fname, hdfs_tgt_dir) else: print "DEBUG: replace ", tgt_fname, "by", src_fname else: print tgt_fname, "has a newer mdate:", tgt_mtime, "than", src_fname, ":", src_ctime_int except: e = sys.exc_info()[0] print "Error({0}): {1}".format(e.errno, e.strerror)