Example #1
def glob(file_pattern):
    """
  @depreciated
  Now I only find pyhdfs.hdfsListDirectory, @TODO support hdfsGlob 
  """
    if file_pattern.startswith(HDFS_START):
        handle = get_handle()
        # Full hdfs:// URI: pass it to hdfsListDirectory unchanged.
        return pyhdfs.hdfsListDirectory(handle, file_pattern)
    elif file_pattern.startswith(START_DIR):
        handle = get_handle()
        # Cluster-relative path (e.g. /app/tuku/..): expand it and keep non-empty files.
        result, num = pyhdfs.hdfsListDirectory(handle, fullpath(file_pattern))
        return [
            item.mName for item in
            [pyhdfs.hdfsFileInfo_getitem(result, i) for i in xrange(num)]
            if item.mSize > 0
        ]
    # Anything else is treated as a local filesystem pattern.
    return local_glob.glob(file_pattern)
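A minimal usage sketch for the dispatcher above, assuming HDFS_START and START_DIR are prefix constants (roughly "hdfs://" and "/app/tuku") from the same module; the paths are hypothetical. Note that the branches do not all return the same shape: the hdfs:// branch passes through the raw hdfsListDirectory result, while the other two return lists.

# Hypothetical calls; the path prefix selects the branch inside glob().
raw = glob('hdfs://namenode:54310/app/tuku/some_dir')  # raw hdfsListDirectory result
names = glob('/app/tuku/some_dir')                     # list of non-empty entry names
local = glob('./data/part-*')                          # plain local_glob.glob result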
Example #2
    def listDirectory(self, path):
        """
            Get list of files/directories for a given directory-path. 

            @type path: string
            @param path: The path of the directory. 

            @rtype: list
            @return: Return a list contained all files in given directory on succrss, or the length of list is zero.
        """
        return pyhdfs.hdfsListDirectory(self.hdfs, path)
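A hypothetical caller for the wrapper above; the enclosing class is assumed to hold a connected handle in self.hdfs. Note that the other snippets on this page unpack hdfsListDirectory into a (result, num) pair and walk it with hdfsFileInfo_getitem, so the raw return value may need the same treatment despite the docstring's @rtype:

# Hypothetical usage; 'client' is an instance of the class defining listDirectory.
result, num = client.listDirectory('/app/tuku')
names = [pyhdfs.hdfsFileInfo_getitem(result, i).mName for i in xrange(num)]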
Example #3
    def listDirectory(self, path):
        """
            Get a list of files/directories for a given directory path.

            @type path: string
            @param path: The path of the directory.

            @rtype: list
            @return: A list containing all files in the given directory on success; an empty list otherwise.
        """
        return pyhdfs.hdfsListDirectory(self.hdfs, path)
Example #4
from __future__ import print_function  # needed on Python 2 for print(..., file=...)

import sys

def hdfs_listdir(dir):
    # Currently only supports a plain directory listing, not glob patterns.
    handle = get_handle()
    result, num = pyhdfs.hdfsListDirectory(handle, fullpath(dir))
    final_result = [
        item.mName for item in
        [pyhdfs.hdfsFileInfo_getitem(result, i) for i in xrange(num)]
        if item.mSize > 0
    ]
    print('original entry count: {}, non-empty count: {}'.format(
        num, len(final_result)),
          file=sys.stderr)
    return final_result
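A hypothetical call, assuming get_handle() and fullpath() come from the same module as hdfs_listdir; the directory path is a placeholder:

files = hdfs_listdir('/app/tuku/some_dir')  # placeholder path
for name in files:
    print(name)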
Example #5
import os
import pyhdfs

name_node_address = 'nj01-nanling-hdfs.dmop.baidu.com'
name_node_port = 54310
user_name = 'tuku'
user_password = '******'

# Load the client log configuration, then connect to the name node as the given user.
pyhdfs.com_loadlog("./conf/", "log.conf")
fs = pyhdfs.hdfsConnectAsUser(name_node_address, name_node_port, user_name, user_password)

path = '/app/tuku/bianyunlong/clickquery/clickquery_merge_triplet.test'
path_out = '/app/tuku/chenghuige/image-text-sim/test'
# path already starts with '/', so don't add another slash after the port.
hdfs_path = "hdfs://{}:{}{}".format(name_node_address, name_node_port, path)
result, num = pyhdfs.hdfsListDirectory(fs, hdfs_path)
files = [item.mName for item in [pyhdfs.hdfsFileInfo_getitem(result, i) for i in xrange(num)]]

import glob  # only used by the commented-out post-processing below
# for file in files:
#   print file
#   os.system('rm -rf ./test_src/*')
#   os.system('rm -rf ./test/*')  
#   os.system('hadoop fs -get %s ./test_src'%file)
#   for file_ in glob.glob('./test_src/*'):
#     print file_
#     os.system('python /home/img/chenghuige/tools/split.py %s'%file_)
#     os.system('python ./gen-records-nonpsave.py --input %s --output %s --name train'%(file_, './test'))
#   os.system('hadoop fs -put ./test/* %s' % path_out)

# for file_ in glob.glob('./test/*'):
#   if file_.endswith('.npy'):
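Putting the pieces together, a minimal end-to-end sketch (Python 2) that uses only the calls demonstrated on this page; the address and credentials are the placeholders from Example #5, and the directory path is hypothetical:

import pyhdfs

# Connect as in Example #5; credentials are placeholders.
fs = pyhdfs.hdfsConnectAsUser('nj01-nanling-hdfs.dmop.baidu.com', 54310,
                              'tuku', '******')

# List a directory and keep the names of non-empty entries, as Example #4 does.
result, num = pyhdfs.hdfsListDirectory(fs, '/app/tuku/some_dir')  # placeholder path
names = [item.mName for item in
         [pyhdfs.hdfsFileInfo_getitem(result, i) for i in xrange(num)]
         if item.mSize > 0]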