Example #1
class WhenTestingListDirOperation(unittest.TestCase):

    def setUp(self):

        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.path = 'user/hdfs/old_dir'
        self.response = MagicMock()
        self.file_status = {
            "FileStatuses": {
                "FileStatus": [
                    {
                        "accessTime": 0,
                        "blockSize": 0,
                        "group": "supergroup",
                        "length": 24930,
                        "modificationTime": 1320173277227,
                        "owner": "webuser",
                        "pathSuffix": "a.patch",
                        "permission": "777",
                        "replication": 0,
                        "type": "FILE"
                    },
                    {
                        "accessTime": 0,
                        "blockSize": 0,
                        "group": "supergroup",
                        "length": 0,
                        "modificationTime": 1320173277227,
                        "owner": "webuser",
                        "pathSuffix": "",
                        "permission": "777",
                        "replication": 0,
                        "type": "DIRECTORY"
                    }
                ]
            }
        }
        self.response.json = MagicMock(return_value=self.file_status)

    @patch.object(Session, 'get')
    def test_get_status_throws_exception_for_not_ok(self, mock_get):

        self.response.status_code = http_client.BAD_REQUEST
        mock_get.return_value = self.response
        with self.assertRaises(errors.PyWebHdfsException):
            self.webhdfs.list_dir(self.path)

    @patch.object(Session, 'get')
    def test_get_status_returns_true(self, mock_get):

        self.response.status_code = http_client.OK
        mock_get.return_value = self.response
        result = self.webhdfs.list_dir(self.path)

        for key in result:
            self.assertEqual(result[key], self.file_status[key])
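
For context, a minimal sketch of the imports this test module relies on, inferred from the names used above (the exact mock/six aliases are an assumption):

import unittest
from unittest.mock import MagicMock, patch  # or: from mock import MagicMock, patch

from requests import Session
from six.moves import http_client

from pywebhdfs import errors
from pywebhdfs.webhdfs import PyWebHdfsClient

if __name__ == '__main__':
    unittest.main()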
Example #2
File: hdfs.py Project: bkanuka/pymc
class HDFS(NDArray):
	'''
	HDFS storage

	Parameters
	----------
	name : str
		Name of directory to store text files (Path to the directory) without
		a leading '/'
	model : Model
		If None, the model is taken from the 'with' context
	vars : list of variables
		Sampling values will be stored for these variables. If None,
		'model.unobserved_RVs' is used
	host : str
		The IP address or hostname of the HDFS namenode. By default,
		it is 'localhost'
	port : str
		The port number for WebHDFS on the namenode. By default, it
		is '50070'
	user_name : str
		WebHDFS user_name used for authentication. By default, it is
		None
	'''
	def __init__(self, name, model=None, vars=None, host='localhost', port='50070', user_name=None):
		self.hdfs = PyWebHdfsClient(host=host, port=port, user_name=user_name)
		try:
			self.hdfs.list_dir(name)
		except FileNotFound:
			self.hdfs.make_dir(name)
		super(HDFS, self).__init__(name, model, vars)

	def close(self):
		super(HDFS, self).close()
		_dump_trace(self.name, self)
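
A hedged usage sketch for this backend, assuming the PyMC3-style API in which a trace backend instance is passed to sample() via the trace argument; the directory name and connection details below are illustrative only:

from pymc3 import Model, Normal, sample

with Model():
    mu = Normal('mu', mu=0, sd=1)
    # hypothetical HDFS directory (note: no leading '/')
    backend = HDFS('traces/run1', host='namenode', port='50070', user_name='hdfs')
    trace = sample(500, trace=backend)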
Example #3
class HdfsHandler:
    def __init__(self, hadoopHost, hadoopPort='50070', user='******'):
        # self.hdfs = PyWebHdfsClient(host='52.14.121.163', port='50070', user_name='hadoop')
        self.hdfs = PyWebHdfsClient(host=hadoopHost,
                                    port=hadoopPort,
                                    user_name=user)
        self.s3_client = boto3.client('s3')

    def copyToHDFS(self, src_path, hdfs_path):
        if hdfs_path.startswith("hdfs"):
            temp_path = hdfs_path.split("8020")
            self.new_hdfs_path = temp_path[1] + '/lib'
            print "New Path: %s" % self.new_hdfs_path
        # create a new client instance
        # print "New Path: %s" % self.new_hdfs_path[1]
        jar_name = os.path.basename(src_path)
        print(src_path)
        with open(src_path, 'rb') as src:
            fileContent = src.read()

        # copies file to local for testing purpose
        # with open("E:/temp/java-0.0.2.jar", "wb") as jarfile:
        #     jarfile.write(fileContent)

        # create a new file on hdfs
        print('making new file at: {0}\n'.format(jar_name))
        result = self.hdfs.create_file(self.new_hdfs_path + "/" + jar_name,
                                       fileContent,
                                       overwrite=True)
        print "HDFS Copy Result: %s" % result
        return result

    def list_hdfs_dir(self, hdfs_path):
        print(self.hdfs.list_dir(hdfs_path))
Example #4
def downParts(fpath):
    'Download the series of part-0??? files that Spark wrote to HDFS, concatenating them into a single local file while downloading'
    from pywebhdfs.webhdfs import PyWebHdfsClient
    hdfs = PyWebHdfsClient(host='iasp76', port='12003', user_name='mci')
    flist = hdfs.list_dir(fpath)
    x = flist['FileStatuses']['FileStatus']
    _SUCCESS = False
    for f in x:
        if f['pathSuffix'] == '_SUCCESS':
            _SUCCESS = True
            break
    if not _SUCCESS:
        print("not complete yet!")
        return
    fnames = [
        f['pathSuffix'] for f in x if f['pathSuffix'].startswith('part-')
    ]
    fnames1 = sorted(fnames)
    foutname = fpath[fpath.rfind('/') + 1:]
    l = len(fnames1)
    with open(foutname, "wb") as fo:
        for fname in fnames1:
            fpath1 = fpath + "/" + fname
            fo.write(hdfs.read_file(fpath1))
            print(" progress: ", fname, l)
Example #5
class Store (store.Store):
    """
    HDFS backed store.
    """

    def __init__ (self):
        """ Connect to store """
        self._client = PyWebHdfsClient(host=store_host, port=store_port, user_name=store_user)

    def mkdir (self, path):
        self._client.make_dir(path)

    def read (self, path, open_handle):
        return StoreFile(self._client, path, "r", open_handle)

    def append (self, path, open_handle):
        return StoreFile(self._client, path, "a", open_handle)

    def write (self, path, open_handle):
        return StoreFile(self._client, path, "w", open_handle)

    def exists (self, path):
        try:
            dirinfo = self._client.list_dir(path)
            return True
        except errors.FileNotFound:
            return False
    
    def walk (self, path, visitor, recursive = False):
        """ Walk files in a path. Use recursive=True to include subdirs """
        dirinfo = self._client.list_dir(path)
        for status in dirinfo["FileStatuses"]["FileStatus"]:
            if recursive and status["type"] == "DIRECTORY":
                if len(path) > 0:
                    self.walk(path + "/" + status["pathSuffix"], visitor, recursive)
                else:
                    self.walk(status["pathSuffix"], visitor, recursive)
            else:
                info = dict(name=status["pathSuffix"],
                            # WebHDFS reports modificationTime in milliseconds
                            modify=datetime.fromtimestamp(status["modificationTime"] / 1000.0),
                            size=status["length"])
                visitor(path, info)
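
A short usage sketch for the visitor-style walk() above; store_host, store_port and store_user are assumed to be configured module globals, and the path below is illustrative:

def print_entry(path, info):
    # info carries 'name', 'modify' (datetime) and 'size', as built in walk()
    print("{0}/{1}\t{2}\t{3}".format(path, info["name"], info["size"], info["modify"]))

store = Store()
store.walk("user/hdfs/logs", print_entry, recursive=True)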
Example #6
class HDFS(NDArray):
    '''
	HDFS storage

	Parameters
	----------
	name : str
		Name of directory to store text files (Path to the directory) without
		a leading '/'
	model : Model
		If None, the model is taken from the 'with' context
	vars : list of variables
		Sampling values will be stored for these variables. If None,
		'model.unobserved_RVs' is used
	host : str
		The IP address or hostname of the HDFS namenode. By default,
		it is 'localhost'
	port : str
		The port number for WebHDFS on the namenode. By default, it
		is '50070'
	user_name : str
		WebHDFS user_name used for authentication. By default, it is
		None
	'''
    def __init__(self,
                 name,
                 model=None,
                 vars=None,
                 host='localhost',
                 port='50070',
                 user_name=None):
        self.hdfs = PyWebHdfsClient(host=host, port=port, user_name=user_name)
        try:
            self.hdfs.list_dir(name)
        except FileNotFound:
            self.hdfs.make_dir(name)
        super(HDFS, self).__init__(name, model, vars)

    def close(self):
        super(HDFS, self).close()
        _dump_trace(self.name, self)
Example #7
class HdfsHandler(object):
    def __init__(self):
        self._HDFS = PyWebHdfsClient(host='10.81.1.160',
                                     port='50070',
                                     user_name='hdfs')

    def readFile(self, file):
        dirToRead = "%s/%s" % (LOG_ROOT_DIR, file)
        dataOut = self._HDFS.list_dir(dirToRead)
        fileToRead = "%s/%s" % (
            dirToRead, dataOut['FileStatuses']['FileStatus'][1]['pathSuffix'])
        return self._HDFS.read_file(fileToRead)
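
A brief usage sketch; LOG_ROOT_DIR is assumed to be defined elsewhere in the module, and the subdirectory name is illustrative. Note that readFile picks FileStatus[1], so it assumes the directory listing holds at least two entries (e.g. a _SUCCESS marker followed by a data file):

handler = HdfsHandler()
data = handler.readFile('app-logs-2020-01-01')  # hypothetical subdirectory under LOG_ROOT_DIR
print(data)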
Example #8
# checksum reflects file changes
file_checksum = hdfs.get_file_checksum(example_file)
print(file_checksum)

# read in the data for the file
print('reading data from file at: {0}\n'.format(example_file))
file_data = hdfs.read_file(example_file)
print(file_data)

# rename the example_dir
print('renaming directory from {0} to {1}\n'.format(example_dir, rename_dir))
hdfs.rename_file_dir(example_dir, '/{0}'.format(rename_dir))

# list the contents of the new directory
listdir_stats = hdfs.list_dir(rename_dir)
print(listdir_stats)

example_file = '{dir}/example.txt'.format(dir=rename_dir)

# delete the example file
print('deleting example file at: {0}'.format(example_file))
hdfs.delete_file_dir(example_file)

# list the contents of the directory
listdir_stats = hdfs.list_dir(rename_dir)
print(listdir_stats)

# delete the example directory
print('deleting the example directory at: {0}'.format(rename_dir))
hdfs.delete_file_dir(rename_dir, recursive='true')
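
This excerpt starts mid-script; a minimal sketch of the setup it assumes, using only pywebhdfs calls that appear elsewhere on this page (names and values are illustrative):

from pywebhdfs.webhdfs import PyWebHdfsClient

# hypothetical connection details and paths
hdfs = PyWebHdfsClient(host='localhost', port='50070', user_name='hdfs')
example_dir = 'user/hdfs/example_dir'
rename_dir = 'user/hdfs/example_rename'
example_file = '{dir}/example.txt'.format(dir=example_dir)

hdfs.make_dir(example_dir)
hdfs.create_file(example_file, b'example data', overwrite=True)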
Example #9
from pywebhdfs.webhdfs import PyWebHdfsClient
import logging
from pprint import pprint

logging.basicConfig(level=logging.DEBUG)
_LOG = logging.getLogger(__name__)


#host= your server address.
hdfs = PyWebHdfsClient(host='', port='50070', user_name='hduser', timeout=4)  # your Namenode IP & username here
my_dir = '/user/hduser/sample'
fileFinal = my_dir + '/file.txt'
pprint(hdfs.list_dir(my_dir))


dir_status = hdfs.get_file_dir_status(my_dir)
print(dir_status)
print("Reading file from hadoop hdfs")
file_data = hdfs.read_file("user/hduser/sample/file.txt")

print(file_data)
Example #10
if __name__ == '__main__':
    global data_creator  # one object to hold all the data to stream to the client
    global date_list  # one list object to hold all the dates whose data are stored in HDFS
    # starting the new thread to run the server separately
    server = Thread(target=runServer)
    server.start()
    # creating wall coordinates list for use later in this view
    DataCreator.read_transformed_coordinates_to_array()
    # instance that creates data in a different format for each date
    data_creator = DataCreator()
    date_list = []

    from pywebhdfs.webhdfs import PyWebHdfsClient
    hdfs = PyWebHdfsClient(host='cshadoop.boisestate.edu', port='50070', user_name='uacharya')

    dir_listing = hdfs.list_dir('user/uacharya/flow')
    # list of dir dictionaries
    ls_dir = dir_listing['FileStatuses']['FileStatus']
    # appending to the list all the dates whose data are stored in hdfs
    for d in ls_dir:
        date_list.append(int(d['pathSuffix']))

    # creating the required data in memory for all the data available
    for date in date_list:
        print("started creating data for date {}".format(date))
        data_creator.create_data_for_date(date)
        print(data_creator.check_available_data(date, aggregated=True))
        print(data_creator.check_available_data(date, bitmap=True))
    
    
Example #11
import datetime as dt
from os.path import join

from pyspark import SparkContext
from pywebhdfs.webhdfs import PyWebHdfsClient
from yahoo_finance import Share  # assumed source of Share.get_historical()

def formatLine(line):
    line = list(line.values())  # make the dict values indexable under Python 3
    line[0], line[5] = line[5], line[0]
    return ', '.join(line)

if __name__ == '__main__':

    host = 'hdfs://localhost:9000'
    ticker_path = host + '/user/hadoop/tickers.txt'
    save_path = host + '/user/hadoop/stock'

    hdfs = PyWebHdfsClient(host='localhost', port='50070', user_name='hadoop')
    folder = hdfs.list_dir('user/hadoop/stock')['FileStatuses']['FileStatus']
    files = sorted([dt.datetime.strptime(f['pathSuffix'].split('.')[0], '%Y-%m-%d').date() for f in folder])
    end = dt.date.today().strftime('%Y-%m-%d')
    
    sc = SparkContext(appName='stock_data')
    
    if len(files) > 3:
        hdfs.delete_file_dir(join(save_path, files[0].strftime('%Y-%m-%d') + '.csv'), recursive=True)

    if len(files) == 0:
        start = '2014-01-01'
        stockData = sc.textFile(ticker_path).flatMap(lambda x: Share(x).get_historical(start, end)).map(formatLine)
        stockData.saveAsTextFile(join(save_path, end + '.csv'))
    else:
        start = (files[-1] + dt.timedelta(days=1)).strftime('%Y-%m-%d')
        histStockData = sc.textFile(join(save_path, files[-1].strftime('%Y-%m-%d') + '.csv'))
Example #12
def prepare_for_CPP_converter(task_name, in_dir, ofile_path, hdfs_host,
                              hdfs_port, hdfs_user):
    hdfs_client = PyWebHdfsClient(host=hdfs_host,
                                  port=str(hdfs_port),
                                  user_name=hdfs_user)
    file_statuses = hdfs_client.list_dir(in_dir)["FileStatuses"]["FileStatus"]
    # pprint(file_statuses)
    # sort the input log files by numeric suffix, skipping COPY files
    file_list = sorted(
        [
            file_status["pathSuffix"] for file_status in file_statuses
            if "COPY" not in file_status["pathSuffix"]
        ],
        key=lambda x: int(x.split(".")[-1]))

    ofile = open(ofile_path, "wb")
    req_buffer = defaultdict(list)
    finished_log_idx, n_req_total, last_write_ts, obj_id_mapping = -1, 0, 0, {}

    for ifile_name in file_list:
        n_req = 0
        ifile_path = "{}/{}".format(in_dir, ifile_name)
        file_idx = int(ifile_name.split(".")[-1])
        if file_idx < finished_log_idx:
            logging.info("skip {}".format(ifile_path))
        trace_reader = TraceReaderHDFS(ifile_path,
                                       "twr",
                                       hdfs_host,
                                       hdfs_port,
                                       hdfs_user,
                                       hdfs_buffer_size=128 * MB)

        raw_line = trace_reader.read_one_req()
        while raw_line:
            line = raw_line.decode()
            n_req += 1
            ts, obj_id, ksize, vsize, client_id, op, namespace, ttl = _parse_log_line(
                line,
                last_write_ts,
                line_no=n_req,
                ifile_path="{}/{}".format(in_dir, ifile_name))

            if last_write_ts == 0:
                last_write_ts = ts - 1

            # gizmoduck start time
            if task_name == "gizmoduck_cache" or task_name == "wtf_req_cache":
                if ts < 1585706400:
                    raw_line = trace_reader.read_one_req()
                    continue
                # # gizmoduck end time
                # if ts > 1585706400 + 3600 * 24 * 2:
                #   break

            # friday 4pm
            if ts < 1585324800:
                raw_line = trace_reader.read_one_req()
                continue

            # next friday 4pm GMT
            if ts > 1585929600:
                break

            req = Req(real_time=ts,
                      obj_id=obj_id,
                      key_size=ksize,
                      value_size=vsize,
                      op=op,
                      ttl=ttl)
            last_write_ts = buffer_req_to_guarantee_time_monotonic(
                req, req_buffer, ofile, last_write_ts, 2)
            raw_line = trace_reader.read_one_req()
        n_req_total += n_req
        t1 = time.time()
        logging.info(
            "finish converting one file - {} kReq / total {} kReq, {} kobj, dump time {:.2f}s ifile {}"
            .format(n_req // 1000, n_req_total // 1000,
                    len(obj_id_mapping) // 1000,
                    time.time() - t1, "/".join(ifile_path.split("/")[-4:])))

    for ts in sorted(req_buffer.keys()):
        for req in req_buffer[ts]:
            _write_to_ofile(req, ofile)

    logging.info(
        "****************************** finish all conversion {} kReq, ofile {}"
        .format(n_req_total // 1000, ofile_path))
    with open("conversion.finish", "a") as ofile2:
        ofile2.write("{}\n".format(in_dir))

    ofile.flush()
    ofile.close()
    return n_req_total, len(obj_id_mapping)
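
A hedged call sketch; every value below (task name, paths, connection details) is hypothetical, and the helpers the converter depends on (TraceReaderHDFS, _parse_log_line, buffer_req_to_guarantee_time_monotonic, _write_to_ofile) are assumed to be defined in the surrounding module:

n_req, n_obj = prepare_for_CPP_converter(
    task_name="gizmoduck_cache",            # one of the task names special-cased above
    in_dir="/user/traces/gizmoduck/logs",   # hypothetical HDFS input directory
    ofile_path="gizmoduck.bin",             # local output file
    hdfs_host="namenode.example.com",
    hdfs_port=50070,
    hdfs_user="hdfs")
print("converted {} requests over {} objects".format(n_req, n_obj))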
Example #13
class HadoopFileSystem(BaseFs.FileSystem):
    def __init__(self,
                 vcPath,
                 simulateOnly=False,
                 isVerbose=False,
                 logger=None,
                 user=None,
                 host=None,
                 port=None):
        BaseFs.FileSystem.__init__(self, simulateOnly, isVerbose, logger)
        config = Config.Config()
        hdfsUser, hdfsHost, hdfsPort = config.getHadoopConfig(user, host, port)
        self.hdfs = PyWebHdfsClient(host=hdfsHost,
                                    port=hdfsPort,
                                    user_name=hdfsUser)
        self.vcPath = vcPath

    def make_fd(self, path, isSrc, dstDirMustExist):
        fd = None
        try:
            fd = HadoopFileDescriptor(self, path, isSrc, dstDirMustExist)
        except pywebhdfs.errors.FileNotFound:
            self.logger.info("DESC: does not exist: " + path)
            raise Errors.FileNotFound("Path {0} does not exist".format(path))
        except pywebhdfs.errors.Unauthorized as e:
            self.logger.info("Unauthorized for path {0}: {1}".format(path, e))
            raise Errors.Unauthorized(
                "Unauthorized access to the path {0}: {1}".format(path, e))
        except requests.exceptions.RequestException as e:
            self.logger.info("ConnectionError for path {0}: {1}".format(
                path, e))
            raise Errors.BadConnection(
                "Connection error while looking for path: {0}, exc={1}".format(
                    path, e))
        except pywebhdfs.errors.PyWebHdfsException as e:
            self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                path, e))
            raise Errors.FsException(
                "An exception happened while looking for path: {0}, exc={1}".
                format(path, e))
        return fd

    def exists_file_dir(self, fd):
        try:
            return self.hdfs.exists_file_dir(fd.abspath)
        except pywebhdfs.errors.Unauthorized as e:
            self.logger.info("Unauthorized for path {0}: {1}".format(
                fd.abspath, e))
            raise Errors.Unauthorized(
                "Unauthorized access to the path {0}: {1}".format(
                    fd.abspath, e))
        except requests.exceptions.RequestException as e:
            self.logger.info("ConnectionError for path {0}: {1}".format(
                fd.abspath, e))
            raise Errors.BadConnection(
                "Connection error during HDFS exists test: {0}, exc={1}".
                format(fd.abspath, e))
        except pywebhdfs.errors.PyWebHdfsException as e:
            self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                fd.abspath, e))
            raise Errors.FsException(
                "An exception happened during HDFS exists test: {0}, exc={1}".
                format(fd.abspath, e))

    def delete_file_dir(self, fd, recursive=False, force=False):
        if self.simulateOnly:
            print("SIMULATE -> remove file/dir: {0}, recursive={1}".format(
                fd.abspath, recursive))
        else:
            try:
                if not recursive or force or \
                        query_yes_no(question="Are you sure you want to delete folder recursively?", default="no"):
                    status = self.hdfs.delete_file_dir(fd.abspath,
                                                       recursive=recursive)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(
                        fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS delete directory: {0}, exc={1}"
                    .format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS delete directory: {0}, exc={1}"
                    .format(fd.abspath, e))

    def list_dir(self, fd):
        try:
            status = self.hdfs.list_dir(fd.abspath)
        except pywebhdfs.errors.Unauthorized as e:
            self.logger.info("Unauthorized for path {0}: {1}".format(
                fd.abspath, e))
            raise Errors.Unauthorized(
                "Unauthorized access to the path {0}: {1}".format(
                    fd.abspath, e))
        except requests.exceptions.RequestException as e:
            self.logger.info("ConnectionError for path {0}: {1}".format(
                fd.abspath, e))
            raise Errors.BadConnection(
                "Connection error while looking for path: {0}, exc={1}".format(
                    fd.abspath, e))
        except pywebhdfs.errors.PyWebHdfsException as e:
            self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                fd.abspath, e))
            raise Errors.FsException(
                "An exception happened while looking for path: {0}, exc={1}".
                format(fd.abspath, e))
        currentDir = status["FileStatuses"]["FileStatus"]
        for item in currentDir:
            yield HadoopFileDescriptor(self,
                                       fd.abspath,
                                       isSrc=True,
                                       needsDstDirCheck=False,
                                       fileJson=item)

    def make_dir(self, path):
        if self.simulateOnly:
            print("SIMULATE -> make dir: " + path)
        else:
            try:
                self.hdfs.make_dir(path)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    path, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(path, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    path, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS create directory: {0}, exc={1}"
                    .format(path, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    path, e))
                raise Errors.FsException(
                    "An exception happened during HDFS create directory: {0}, exc={1}"
                    .format(path, e))

    def open_file(self, fd, rwMode):
        return fd

    def close_file(self, fd):
        pass

    def touch_file(self, fd):
        if self.simulateOnly:
            print("SIMULATE -> touch file: " + fd.abspath)
        else:
            try:
                self.hdfs.create_file(fd.abspath, 0, overwrite=True)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(
                        fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS create file: {0}, exc={1}".
                    format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS create file: {0}, exc={1}"
                    .format(fd.abspath, e))

    def truncate_file(self, fd, size):
        if self.simulateOnly:
            print("SIMULATE -> truncate file: {0}, size={1}".format(
                fd.abspath, size))
        else:
            try:
                self.hdfs.truncate_file(fd.abspath, size)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(
                        fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS truncate file: {0}, exc={1}".
                    format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS truncate file: {0}, exc={1}"
                    .format(fd.abspath, e))

    def try_concat_files(self, fd, chunkFdList):
        # Workaround for unordered concat bug in Hadoop 2.7.1 is to use one source at the time
        # https://issues.apache.org/jira/browse/HDFS-8891
        currIndex = 0
        concatStep = 20
        chunkedList = [
            chunkFdList[pos:pos + concatStep]
            for pos in range(0, len(chunkFdList), concatStep)
        ]
        for sourceChunk in chunkedList:
            try:
                self.concat_files(fd, sourceChunk)
                currIndex += len(sourceChunk)
            except Errors.FsException as e:
                break

        return currIndex

    def concat_files(self, fd, chunkFdList):
        strList = list()
        for chunkFd in chunkFdList:
            strList.append(chunkFd.abspath)

        if self.simulateOnly:
            print("SIMULATE -> concat file: {0}, sources={1}".format(
                fd.abspath, ",".join(strList)))
        else:
            try:
                self.hdfs.concat_files(fd.abspath, strList)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(
                        fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS concat file: {0}, exc={1}".
                    format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS concat file: {0}, exc={1}"
                    .format(fd.abspath, e))

    def read_data(self, fd, offset, size):
        if offset >= fd.size:
            return ""
        else:
            try:
                contents = self.hdfs.read_file(fd.abspath,
                                               offset=offset,
                                               length=size)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(
                        fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS read file: {0}, exc={1}".
                    format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS read file: {0}, exc={1}"
                    .format(fd.abspath, e))
            return contents

    def append_data(self, fd, data):
        if self.simulateOnly:
            print("SIMULATE -> write file data: " + fd.abspath)
        else:
            try:
                self.hdfs.append_file(fd.abspath, data)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(
                        fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS append file: {0}, exc={1}".
                    format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS append file: {0}, exc={1}"
                    .format(fd.abspath, e))

    def local_mv_file(self, src, dst):
        if self.simulateOnly:
            print("SIMULATE -> local move file: {0} -> {1} ".format(
                src.abspath, dst.abspath))
        else:
            try:
                self.hdfs.rename_file_dir(src.abspath, dst.abspath)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    src.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(
                        src.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    src.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS rename file: {0}, exc={1}".
                    format(src.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    src.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS rename file: {0}, exc={1}"
                    .format(src.abspath, e))

    def local_cp_file(self, src, dst):
        # This is an open issue in Hadoop community: https://issues.apache.org/jira/browse/HDFS-3370
        # Instead, we can do a symbolic link
        if self.simulateOnly:
            print("SIMULATE -> local copy file: {0} -> {1} ".format(
                src.abspath, dst.abspath))
        else:
            print(
                "Copy within HDFS is not supported due to lack of Hadoop support"
            )
            print(
                "Once symbolic links are enabled, this feature will be enabled"
            )
            sys.exit(1)
            # self.hdfs.create_sym_link(src.abspath, dst.abspath, createParent=True)

    def get_hdfs_file_dir_json(self, path):
        try:
            status = self.hdfs.get_file_dir_status(path)
            return status["FileStatus"]
        except pywebhdfs.errors.FileNotFound:
            return None

    def validate_hdfs_arg(self, arg):
        if not arg.startswith(self.vcPath):
            print("Error: You don't have permissions to the path: %s" % arg)
            print("Your path must be rooted under: %s" % self.vcPath)
            sys.exit(1)
Example #14
class WhenTestingListDirOperation(unittest.TestCase):

    def setUp(self):

        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.path = 'user/hdfs/old_dir'
        self.file_status = {
            "FileStatuses": {
                "FileStatus": [
                    {
                        "accessTime": 0,
                        "blockSize": 0,
                        "group": "supergroup",
                        "length": 24930,
                        "modificationTime": 1320173277227,
                        "owner": "webuser",
                        "pathSuffix": "a.patch",
                        "permission": "777",
                        "replication": 0,
                        "type": "FILE"
                    },
                    {
                        "accessTime": 0,
                        "blockSize": 0,
                        "group": "supergroup",
                        "length": 0,
                        "modificationTime": 1320173277227,
                        "owner": "webuser",
                        "pathSuffix": "",
                        "permission": "777",
                        "replication": 0,
                        "type": "DIRECTORY"
                    }
                ]
            }
        }
        self.response.json = MagicMock(return_value=self.file_status)

    def test_get_status_throws_exception_for_not_ok(self):

        self.response.status_code = httplib.BAD_REQUEST
        self.requests.get.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.list_dir(self.path)

    def test_get_status_returns_true(self):

        self.response.status_code = httplib.OK
        self.requests.get.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            result = self.webhdfs.list_dir(self.path)

        for key in result:
            self.assertEqual(result[key], self.file_status[key])
Example #15
    def upload_to_hdfs(self, local_file, table, index):
        '''
            upload file from local filesystem to hdfs
        '''
        hiveOper = hive_op.HiveOperation()
        local_dir = self._conf.get('local', 'data_dir')
        local_path = '{}{}/{}'.format(local_dir, index, local_file)
        host1 = self._conf.get('hdfs', 'name_node1')
        host2 = self._conf.get('hdfs', 'name_node2')
        user = self._conf.get('hdfs', 'user')
        port = self._conf.getint('hdfs', 'port')
        hdfs_base_path = self._conf.get('hdfs', 'upload_path')
        hdfs_dir_path = '{}{}'.format(hdfs_base_path, index)
        hdfs_path = '{}{}/{}'.format(hdfs_base_path, index, local_file)

        #implement HA manually
        try:
            hdfs_cli = PyWebHdfsClient(host=host1, port=port, user_name=user)
            hdfs_cli.list_dir('/')
        except Exception as e:
            logger.warn('open hdfs client failed error {}'.format(e))
            hdfs_cli = PyWebHdfsClient(host=host2, port=port, user_name=user)
            hdfs_cli.list_dir('/')

        if hdfs_cli is None:
            logger.error('no active host')
            return None

        try:
            hdfs_cli.get_file_dir_status(hdfs_path)

            # If the temp file already exists on HDFS, the previous load into Hive
            # probably failed or the process was killed midway: load the temp file's
            # data into Hive first, then continue.
            ret = hiveOper.load_hdfs_file_into_tmp_table(hdfs_path, table)
            if ret == -1:
                logger.error('load from hdfs to tmp table failed')

            logger.info('last time! {} load into tmp finished'.format(table))
            hiveOper.load_tmp_table_to_main(table)
            logger.info(
                'last time! {} load tmp table to main finished'.format(table))

        # FileNotFoundException
        except Exception as e:
            # it is normal for the file not to exist here
            logger.debug('no such file {}'.format(hdfs_path))

        retry_count = 0
        upload_finished = False
        while retry_count <= 10 and not upload_finished:
            with open(local_path) as f:
                logger.debug('''local path is {}, hdfs_cli is {}, 
                    file is {}, hdfs_path is {}'''.format(
                    local_path, hdfs_cli, f, hdfs_path))
                #hdfs_cli.delete_file_dir(hdfs_path)

                # if the directory does not exist, create it first
                try:
                    hdfs_cli.get_file_dir_status(hdfs_dir_path)
                except Exception as e:
                    hdfs_cli.make_dir(hdfs_dir_path)

                try:
                    hdfs_cli.create_file(hdfs_path, f)
                    upload_finished = True
                except Exception as e:
                    logger.warn('''create file on hdfs failed, 
                        local path is {}, hdfs path is {}, 
                        retry count {}, upload flag {}'''.format(
                        local_path, hdfs_path, retry_count, upload_finished))
                    logger.warn('error is {}'.format(e))
                    retry_count += 1

        if retry_count <= 10:
            return hdfs_path
        else:
            logger.error('''{} upload 10 times, still failed, 
                retry count {}, upload_flag is {}'''.format(
                local_path, retry_count, upload_finished))
            return None
Example #16
hdfs.append_file(example_file, example_data)

file_status = hdfs.get_file_dir_status(example_file)
print(file_status)

# read in the data for the file
print('reading data from file at: {0}\n'.format(example_file))
file_data = hdfs.read_file(example_file)
print(file_data)

# rename the example_dir
print('renaming directory from {0} to {1}\n'.format(example_dir, rename_dir))
hdfs.rename_file_dir(example_dir, '/{0}'.format(rename_dir))

# list the contents of the new directory
listdir_stats = hdfs.list_dir(rename_dir)
print(listdir_stats)

example_file = '{dir}/example.txt'.format(dir=rename_dir)

# delete the example file
print('deleting example file at: {0}'.format(example_file))
hdfs.delete_file_dir(example_file)

# list the contents of the directory
listdir_stats = hdfs.list_dir(rename_dir)
print(listdir_stats)

# delete the example directory
print('deleting the example directory at: {0}'.format(rename_dir))
hdfs.delete_file_dir(rename_dir, recursive='true')

if __name__ == "__main__":

    # set up spark env
    spark = (SparkSession.builder.appName(
        'Data collection with hive').enableHiveSupport().getOrCreate())

    sc = spark.sparkContext

    hdfs = PyWebHdfsClient(host='r-9arp1kfy-0.localdomain',
                           port='50070',
                           user_name='nikikiq')
    data_dir = str(sys.argv[1])
    dump_limit = 2000
    dir_status = hdfs.list_dir(data_dir)
    files = get_files(dir_status['FileStatuses']['FileStatus'])

    if files is None:
        print('No data available, please try again!')
    else:
        # checked logged files and files which may fail due to various errors
        try:
            with open('/vdc/team40/nia_test/collected/file_log.txt',
                      'r') as my_log:
                logged_files = [line.strip() for line in my_log]
                my_log.close()

        except FileNotFoundError:
            logged_files = []