def glob(file_pattern):
    """
    @deprecated Only pyhdfs.hdfsListDirectory is available for now,
    @TODO support hdfsGlob
    """
    if file_pattern.startswith(HDFS_START):
        # Full HDFS URI, e.g. hdfs://...
        handle = get_handle()
        result, num = pyhdfs.hdfsListDirectory(handle, file_pattern)
        # Keep only non-empty files, consistent with the branch below.
        return [item.mName
                for item in [pyhdfs.hdfsFileInfo_getitem(result, i) for i in xrange(num)]
                if item.mSize > 0]
    elif file_pattern.startswith(START_DIR):
        # Bare HDFS path, e.g. /app/tuku/...
        handle = get_handle()
        result, num = pyhdfs.hdfsListDirectory(handle, fullpath(file_pattern))
        return [item.mName
                for item in [pyhdfs.hdfsFileInfo_getitem(result, i) for i in xrange(num)]
                if item.mSize > 0]
    # Not an HDFS path: fall back to the local glob module.
    return local_glob.glob(file_pattern)
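A minimal usage sketch for glob above, assuming the module defines HDFS_START ('hdfs://'), a START_DIR prefix such as '/app/tuku', and the get_handle/fullpath helpers referenced in the body; the paths here are hypothetical examples:

# Usage sketch; both paths are hypothetical.
hdfs_files = glob('/app/tuku/some_dir')   # non-empty files under an HDFS dir
local_files = glob('./data/*.txt')        # falls through to local_glob.glob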
def listDirectory(self, path):
    """
    Get the list of files/directories for a given directory path.
    @type path: string
    @param path: The path of the directory.
    @rtype: tuple
    @return: The (file-info array, count) pair returned by
             pyhdfs.hdfsListDirectory on success, or a zero count otherwise.
    """
    return pyhdfs.hdfsListDirectory(self.hdfs, path)
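Given how pyhdfs.hdfsListDirectory is consumed elsewhere in this file, the return value can be unpacked as below; 'client' is a hypothetical instance of the wrapper class that owns self.hdfs, and the path is an example:

# Usage sketch; 'client' and the path are hypothetical.
result, num = client.listDirectory('/app/tuku/some_dir')
names = [pyhdfs.hdfsFileInfo_getitem(result, i).mName for i in xrange(num)]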
from __future__ import print_function  # needed for print(..., file=...) on Python 2
import sys

def hdfs_listdir(dir):
    # For now this only supports listdir, not glob.
    handle = get_handle()
    result, num = pyhdfs.hdfsListDirectory(handle, fullpath(dir))
    # Keep only non-empty files.
    final_result = [item.mName
                    for item in [pyhdfs.hdfsFileInfo_getitem(result, i) for i in xrange(num)]
                    if item.mSize > 0]
    print('ori result num:{}, final result num:{}'.format(num, len(final_result)),
          file=sys.stderr)
    return final_result
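A short usage sketch; the directory is a hypothetical example:

# Usage sketch (hypothetical path): prints every non-empty file under the dir.
for fname in hdfs_listdir('/app/tuku/some_dir'):
    print(fname)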
import os
import glob

import pyhdfs

name_node_address = 'nj01-nanling-hdfs.dmop.baidu.com'
name_node_port = 54310
user_name = 'tuku'
user_password = '******'

pyhdfs.com_loadlog("./conf/", "log.conf")
fs = pyhdfs.hdfsConnectAsUser(name_node_address, name_node_port,
                              user_name, user_password)

path = '/app/tuku/bianyunlong/clickquery/clickquery_merge_triplet.test'
path_out = '/app/tuku/chenghuige/image-text-sim/test'
hdfs_path = "hdfs://{}:{}/{}".format(name_node_address, name_node_port, path)

result, num = pyhdfs.hdfsListDirectory(fs, hdfs_path)
files = [item.mName
         for item in [pyhdfs.hdfsFileInfo_getitem(result, i) for i in xrange(num)]]

# for file in files:
#     print file
#     os.system('rm -rf ./test_src/*')
#     os.system('rm -rf ./test/*')
#     os.system('hadoop fs -get %s ./test_src' % file)
#     for file_ in glob.glob('./test_src/*'):
#         print file_
#         os.system('python /home/img/chenghuige/tools/split.py %s' % file_)
#         os.system('python ./gen-records-nonpsave.py --input %s --output %s --name train' % (file_, './test'))
#         os.system('hadoop fs -put ./test/* %s' % path_out)  # fixed: only path_out is interpolated
# for file_ in glob.glob('./test/*'):
#     if file_.endswith('.npy'):