def update_raw_stage(output, delivery_tag):

    #context = zmq.Context()

    #confirm = context.socket(zmq.PUSH)
    #confirm.connect(confirm_host)

    hdfs = PyWebHdfsClient(host=webhdfs_host, port=webhdfs_port, user_name=webhdfs_user)
    impala_conn = connect(host=impala_host, port=int(impala_port))
    cur = impala_conn.cursor()

    start_time = time.time()

    for k, v in output.items():

        if (time.time() - start_time)/60 > sink_minutes:
            sink_logger.warning('ETL process running longer than the sink timeout: {0} minutes'.format((time.time() - start_time)/60))
        try:
            file_name = 'user/impala/test_log_1/raw_log_{0}.txt'.format(k)
            hdfs.append_file(file_name, '\n'.join(v))
            cur.execute('refresh test_log_{0}'.format(k))

        except hdfs_err.PyWebHdfsException:
            file_name = 'user/impala/test_log_1/raw_log_{0}.txt'.format(k)
            hdfs.create_file(file_name, '')
            hdfs.append_file(file_name, '\n'.join(v))
            cur.execute('refresh test_log_{0}'.format(k))

    #confirm.send(delivery_tag)
    sink_logger.info('ETL process finished in {0} minutes'.format((time.time() - start_time)/60))
    sink_logger.info('ETL process finished with {0} delivery_tag'.format(delivery_tag))
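# A minimal, hypothetical invocation sketch for update_raw_stage. It assumes
# the module-level settings referenced above (webhdfs_host, webhdfs_port,
# webhdfs_user, impala_host, impala_port, sink_minutes, sink_logger) are
# already configured, and that `output` maps a table suffix to a list of raw
# log lines:
if __name__ == '__main__':
    sample_output = {'1': ['line one', 'line two']}
    update_raw_stage(sample_output, delivery_tag='tag-001')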
class WhenTestingAppendOperation(unittest.TestCase):

    def setUp(self):

        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.location = 'redirect_uri'
        self.path = 'user/hdfs'
        self.file_data = '010101'
        self.init_response = MagicMock()
        self.init_response.header = {'location': self.location}
        self.response = MagicMock()

    def test_append_throws_exception_for_no_redirect(self):

        self.init_response.status_code = httplib.BAD_REQUEST
        self.response.status_code = httplib.OK
        self.requests.post.side_effect = [self.init_response, self.response]
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.append_file(self.path, self.file_data)

    def test_append_throws_exception_for_not_ok(self):

        self.init_response.status_code = httplib.TEMPORARY_REDIRECT
        self.response.status_code = httplib.BAD_REQUEST
        self.requests.post.side_effect = [self.init_response, self.response]
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.append_file(self.path, self.file_data)

    def test_append_returns_true(self):

        self.init_response.status_code = httplib.TEMPORARY_REDIRECT
        self.response.status_code = httplib.OK
        self.requests.post.side_effect = [self.init_response, self.response]
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            result = self.webhdfs.append_file(self.path, self.file_data)
        self.assertTrue(result)
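# A sketch of the imports the test class above assumes (mock is the standalone
# backport; on Python 3 the same names live in unittest.mock, and httplib
# becomes http.client):
import unittest
import httplib

from mock import MagicMock, patch

from pywebhdfs import errors
from pywebhdfs.webhdfs import PyWebHdfsClient

if __name__ == '__main__':
    unittest.main()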
Example #3
class WhenTestingAppendOperation(unittest.TestCase):
    def setUp(self):

        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host,
                                       port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.location = 'redirect_uri'
        self.path = 'user/hdfs'
        self.file_data = '010101'
        self.init_response = MagicMock()
        self.init_response.header = {'location': self.location}
        self.response = MagicMock()

    def test_append_throws_exception_for_no_redirect(self):

        self.init_response.status_code = http_client.BAD_REQUEST
        self.response.status_code = http_client.OK
        self.requests.post.side_effect = [self.init_response, self.response]
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.append_file(self.path, self.file_data)

    def test_append_throws_exception_for_not_ok(self):

        self.init_response.status_code = http_client.TEMPORARY_REDIRECT
        self.response.status_code = http_client.BAD_REQUEST
        self.requests.post.side_effect = [self.init_response, self.response]
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.append_file(self.path, self.file_data)

    def test_append_returns_true(self):

        self.init_response.status_code = http_client.TEMPORARY_REDIRECT
        self.response.status_code = http_client.OK
        self.requests.post.side_effect = [self.init_response, self.response]
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            result = self.webhdfs.append_file(self.path, self.file_data)
        self.assertTrue(result)
Example #4
hdfs.make_dir(example_dir)

# get a dictionary of the directory's status
dir_status = hdfs.get_file_dir_status(example_dir)
print(dir_status)

# create a new file on hdfs
print('making new file at: {0}\n'.format(example_file))
hdfs.create_file(example_file, example_data)

file_status = hdfs.get_file_dir_status(example_file)
print(file_status)

# append to the file created in previous step
print('appending to file at: {0}\n'.format(example_file))
hdfs.append_file(example_file, example_data)

file_status = hdfs.get_file_dir_status(example_file)
print(file_status)

# read in the data for the file
print('reading data from file at: {0}\n'.format(example_file))
file_data = hdfs.read_file(example_file)
print(file_data)

# rename the example_dir
print('renaming directory from {0} to {1}\n'.format(example_dir, rename_dir))
hdfs.rename_file_dir(example_dir, '/{0}'.format(rename_dir))

# list the contents of the new directory
listdir_stats = hdfs.list_dir(rename_dir)
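# The script above assumes a client and sample names roughly like these
# (host, port, user and paths are placeholders):
from pywebhdfs.webhdfs import PyWebHdfsClient

hdfs = PyWebHdfsClient(host='localhost', port='50070', user_name='hdfs')
example_dir = 'user/hdfs/example_dir'
example_file = '{0}/example.txt'.format(example_dir)
example_data = '01010101010101010101010101010101\n'
rename_dir = 'user/hdfs/example_rename'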
Example #5
print(dir_status)

# create a new file on hdfs
print('making new file at: {0}\n'.format(example_file))
hdfs.create_file(example_file, example_data)

file_status = hdfs.get_file_dir_status(example_file)
print(file_status)

# get the checksum for the file
file_checksum = hdfs.get_file_checksum(example_file)
print(file_checksum)

# append to the file created in previous step
print('appending to file at: {0}\n'.format(example_file))
hdfs.append_file(example_file, example_data)

file_status = hdfs.get_file_dir_status(example_file)
print(file_status)

# checksum reflects file changes
file_checksum = hdfs.get_file_checksum(example_file)
print(file_checksum)

# read in the data for the file
print('reading data from file at: {0}\n'.format(example_file))
file_data = hdfs.read_file(example_file)
print(file_data)

# rename the example_dir
print('renaming directory from {0} to {1}\n'.format(example_dir, rename_dir))
class HDFS(object):
    def __init__(self, host, port, user):
        self._hdfs = PyWebHdfsClient(host=host,
                                     port=port,
                                     user_name=user,
                                     timeout=None)
        logging.debug('webhdfs = %s@%s:%s', user, host, port)

    def recursive_copy(self,
                       local_path,
                       remote_path,
                       exclude=None,
                       permission=755):

        if exclude is None:
            exclude = []

        c_path = canonicalize(remote_path)
        logging.debug('making %s', c_path)
        self._hdfs.make_dir(c_path, permission=permission)

        fs_g = os.walk(local_path)
        for dpath, dnames, fnames in fs_g:
            _, relative_path = dpath.split(local_path)
            for dname in dnames:
                if dname not in exclude:
                    c_path = canonicalize('%s/%s/%s' %
                                          (remote_path, relative_path, dname))
                    logging.debug('making %s', c_path)
                    self._hdfs.make_dir(c_path, permission=permission)

            for fname in fnames:
                if fname not in exclude:
                    data = open(
                        canonicalize('%s/%s/%s' %
                                     (local_path, relative_path, fname)), 'rb')
                    c_path = canonicalize('%s/%s/%s' %
                                          (remote_path, relative_path, fname))
                    logging.debug('creating %s', c_path)
                    self._hdfs.create_file(c_path,
                                           data,
                                           overwrite=True,
                                           permission=permission)
                    data.close()

    def make_dir(self, path, permission=755):

        logging.debug('make_dir: %s', path)

        self._hdfs.make_dir(canonicalize(path), permission=permission)

    def create_file(self, data, remote_file_path, permission=755):

        logging.debug('create_file: %s', remote_file_path)

        sio = BytesIO(data)

        self._hdfs.create_file(canonicalize(remote_file_path),
                               sio,
                               overwrite=True,
                               permission=permission)

    def append_file(self, data, remote_file_path):

        logging.debug('append to: %s', remote_file_path)

        self._hdfs.append_file(canonicalize(remote_file_path), data)

    def stream_file_to_disk(self, remote_file_path, local_file_path):
        chunk_size = 10 * 1024 * 1024
        offset = 0
        with open(local_file_path, 'wb') as dest_file:
            data = self._hdfs.read_file(canonicalize(remote_file_path),
                                        offset=offset,
                                        length=chunk_size)
            while True:
                dest_file.write(data)
                if len(data) < chunk_size:
                    break
                offset += chunk_size
                data = self._hdfs.read_file(canonicalize(remote_file_path),
                                            offset=offset,
                                            length=chunk_size)

    def read_file(self, remote_file_path):

        data = self._hdfs.read_file(canonicalize(remote_file_path))

        return data

    def remove(self, path, recursive=False):

        logging.debug('remove: %s', path)

        self._hdfs.delete_file_dir(canonicalize(path), recursive)

    def file_exists(self, path):

        try:
            self._hdfs.get_file_dir_status(path)
            return True
        except Exception:
            # any failure to fetch the status is treated as "does not exist"
            return False
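# A brief usage sketch for the HDFS wrapper above; the host and paths are
# placeholders, and canonicalize() is assumed to normalize the given paths:
fs = HDFS('namenode.example.com', '50070', 'hdfs')
fs.make_dir('data/incoming')
fs.create_file(b'hello\n', 'data/incoming/hello.txt')
fs.append_file(b'world\n', 'data/incoming/hello.txt')
print(fs.read_file('data/incoming/hello.txt'))
fs.remove('data/incoming', recursive=True)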
class HDFS(object):
    def __init__(self, host, port, user):
        self._hdfs = PyWebHdfsClient(
            host=host, port=port, user_name=user, timeout=None)
        logging.debug('webhdfs = %s@%s:%s', user, host, port)

    def recursive_copy(self, local_path, remote_path, exclude=None):

        if exclude is None:
            exclude = []

        c_path = canonicalize(remote_path)
        logging.debug('making %s', c_path)
        self._hdfs.make_dir(c_path)

        fs_g = os.walk(local_path)
        for dpath, dnames, fnames in fs_g:
            _, relative_path = dpath.split(local_path)
            for dname in dnames:
                if dname not in exclude:
                    c_path = canonicalize(
                        '%s/%s/%s' %
                        (remote_path, relative_path, dname))
                    logging.debug('making %s', c_path)
                    self._hdfs.make_dir(c_path)

            for fname in fnames:
                if fname not in exclude:
                    data = open(
                        canonicalize(
                            '%s/%s/%s' %
                            (local_path, relative_path, fname)), 'rb')
                    c_path = canonicalize(
                        '%s/%s/%s' %
                        (remote_path, relative_path, fname))
                    logging.debug('creating %s', c_path)
                    self._hdfs.create_file(c_path, data, overwrite=True)
                    data.close()

    def make_dir(self, path):

        logging.debug('make_dir: %s', path)

        self._hdfs.make_dir(canonicalize(path))

    def create_file(self, data, remote_file_path):

        logging.debug('create_file: %s', remote_file_path)

        sio = StringIO.StringIO(data)

        self._hdfs.create_file(
            canonicalize(remote_file_path),
            sio,
            overwrite=True)

    def append_file(self, data, remote_file_path):

        logging.debug('append to: %s', remote_file_path)

        self._hdfs.append_file(canonicalize(remote_file_path), data)


    def stream_file_to_disk(self, remote_file_path, local_file_path):
        chunk_size = 10*1024*1024
        offset = 0
        with open(local_file_path, 'wb') as dest_file:
            data = self._hdfs.read_file(canonicalize(remote_file_path), offset=offset, length=chunk_size)
            while True:
                dest_file.write(data)
                if len(data) < chunk_size:
                    break
                offset += chunk_size
                data = self._hdfs.read_file(canonicalize(remote_file_path), offset=offset, length=chunk_size)

    def read_file(self, remote_file_path):

        data = self._hdfs.read_file(canonicalize(remote_file_path))

        return data

    def remove(self, path, recursive=False):

        logging.debug('remove: %s', path)

        self._hdfs.delete_file_dir(canonicalize(path), recursive)
class Daemon(ConsumerMixin):
    def __init__(self, config):
        self.connection = None
        self.queues = []
        self.config = config
        logging.config.dictConfig(config['logger'])
        self._init_rabbitmq()
        if self.config['storage']['localfs']:
            self.logfile = None
            self.current_logfile_path = ''
        # Initialize WebHDFS client
        if self.config['storage']['hdfs']:
            self.hdfs = PyWebHdfsClient(host=config['webhdfs']['host'],
                                        port=config['webhdfs']['port'],
                                        timeout=config['webhdfs']['timeout'])
            self.filename_template = config['webhdfs']['filename_template']

    def _init_rabbitmq(self):
        """
        connect to rabbitmq and init the queues
        """
        self.connection = kombu.Connection(
            self.config['rabbitmq']['broker_url'])
        exchange_name = self.config['rabbitmq']['exchange_name']
        exchange = kombu.Exchange(exchange_name, type="topic")
        logging.getLogger('stat_logger').info(
            "listening on exchange {0:s} at {1:s}".format(
                exchange_name, self.config['rabbitmq']['broker_url']))

        queue = kombu.Queue(name=self.config['rabbitmq']['queue_name'],
                            exchange=exchange,
                            durable=False,
                            auto_delete=self.config['rabbitmq']['auto_delete'],
                            routing_key="#")
        self.queues.append(queue)

    def get_consumers(self, Consumer, channel):
        return [Consumer(queues=self.queues, callbacks=[self.process_task])]

    def process_task(self, body, message):
        stat_request = stat_logger.stat_pb2.StatRequest()
        try:
            stat_request.ParseFromString(body)
            logging.getLogger('stat_logger').debug('query received: {}'.format(
                str(stat_request)))
        except DecodeError as e:
            logging.getLogger('stat_logger').warning(
                "message is not a valid protobuf task: {}".format(str(e)))
            message.ack()
            return

        self.log_message(stat_request)
        message.ack()

    def log_message(self, stat_hit):
        if stat_hit.IsInitialized():
            content = json.dumps(protobuf_to_dict(stat_hit),
                                 separators=(',', ':')) + '\n'

            if self.config['storage']['localfs']:
                self._reopen_logfile(
                    datetime.utcfromtimestamp(stat_hit.request_date))
                self.logfile.write(content)
                self.logfile.flush()

            if self.config['storage']['hdfs']:
                target_filename = self.filename_template.replace(
                    '{request_date}',
                    datetime.utcfromtimestamp(
                        stat_hit.request_date).strftime('%Y%m%d'))
                try:
                    self.hdfs.append_file(target_filename, content)
                except Exception as e:
                    print(e)

    def _reopen_logfile(self, log_date):
        expected_logfile_path = self._get_logfile_path(log_date)
        if self.current_logfile_path != expected_logfile_path:
            if self.logfile is not None:
                self.logfile.close()
            print "Opening file " + expected_logfile_path
            expected_log_dir = os.path.dirname(expected_logfile_path)
            if not os.path.isdir(expected_log_dir):
                os.makedirs(expected_log_dir)
            self.logfile = open(expected_logfile_path, 'a')
            self.current_logfile_path = expected_logfile_path

    def _get_logfile_path(self, log_date):
        return self.config['localfs']['root_dir'] + '/' + log_date.strftime(
            '%Y/%m/%d') + '/stat_log_prod_' + log_date.strftime(
                '%Y%m%d') + '_' + platform.node() + '_' + str(
                    os.getpid()) + '.json.log'

    def __del__(self):
        self.close()

    def close(self):
        if self.logfile is not None:
            self.logfile.close()
        if self.connection and self.connection.connected:
            self.connection.release()
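# A sketch of the configuration dict the Daemon above expects, inferred from
# the keys it reads; every value here is a placeholder:
config = {
    'logger': {'version': 1},
    'storage': {'localfs': True, 'hdfs': False},
    'localfs': {'root_dir': '/var/log/stat_logger'},
    'webhdfs': {
        'host': 'namenode.example.com',
        'port': '50070',
        'timeout': 30,
        'filename_template': 'logs/stat_log_{request_date}.json.log',
    },
    'rabbitmq': {
        'broker_url': 'amqp://guest:guest@localhost:5672//',
        'exchange_name': 'stat_exchange',
        'queue_name': 'stat_queue',
        'auto_delete': False,
    },
}

daemon = Daemon(config)
daemon.run()  # ConsumerMixin.run() consumes until interrupted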
Example #9
class HadoopFileSystem(BaseFs.FileSystem):
    def __init__(self,
                 vcPath,
                 simulateOnly=False,
                 isVerbose=False,
                 logger=None,
                 user=None,
                 host=None,
                 port=None):
        BaseFs.FileSystem.__init__(self, simulateOnly, isVerbose, logger)
        config = Config.Config()
        hdfsUser, hdfsHost, hdfsPort = config.getHadoopConfig(user, host, port)
        self.hdfs = PyWebHdfsClient(host=hdfsHost,
                                    port=hdfsPort,
                                    user_name=hdfsUser)
        self.vcPath = vcPath

    def make_fd(self, path, isSrc, dstDirMustExist):
        fd = None
        try:
            fd = HadoopFileDescriptor(self, path, isSrc, dstDirMustExist)
        except pywebhdfs.errors.FileNotFound:
            self.logger.info("DESC: does not exist: " + path)
            raise Errors.FileNotFound("Path {0} does not exist".format(path))
        except pywebhdfs.errors.Unauthorized as e:
            self.logger.info("Unauthorized for path {0}: {1}".format(path, e))
            raise Errors.Unauthorized(
                "Unauthorized access to the path {0}: {1}".format(path, e))
        except requests.exceptions.RequestException as e:
            self.logger.info("ConnectionError for path {0}: {1}".format(
                path, e))
            raise Errors.BadConnection(
                "Connection error while looking for path: {0}, exc={1}".format(
                    path, e))
        except pywebhdfs.errors.PyWebHdfsException as e:
            self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                path, e))
            raise Errors.FsException(
                "An exception happened while looking for path: {0}, exc={1}".
                format(path, e))
        return fd

    def exists_file_dir(self, fd):
        try:
            return self.hdfs.exists_file_dir(fd.abspath)
        except pywebhdfs.errors.Unauthorized as e:
            self.logger.info("Unauthorized for path {0}: {1}".format(
                fd.abspath, e))
            raise Errors.Unauthorized(
                "Unauthorized access to the path {0}: {1}".format(
                    fd.abspath, e))
        except requests.exceptions.RequestException as e:
            self.logger.info("ConnectionError for path {0}: {1}".format(
                fd.abspath, e))
            raise Errors.BadConnection(
                "Connection error during HDFS exists test: {0}, exc={1}".
                format(fd.abspath, e))
        except pywebhdfs.errors.PyWebHdfsException as e:
            self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                fd.abspath, e))
            raise Errors.FsException(
                "An exception happened during HDFS exists test: {0}, exc={1}".
                format(fd.abspath, e))

    def delete_file_dir(self, fd, recursive=False, force=False):
        if self.simulateOnly:
            print("SIMULATE -> remove file/dir: {0}, recursive={1}".format(
                fd.abspath, recursive))
        else:
            try:
                if not recursive or force or \
                        query_yes_no(question="Are you sure you want to delete folder recursively?", default="no"):
                    status = self.hdfs.delete_file_dir(fd.abspath,
                                                       recursive=recursive)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(
                        fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS delete directory: {0}, exc={1}"
                    .format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS delete directory: {0}, exc={1}"
                    .format(fd.abspath, e))

    def list_dir(self, fd):
        try:
            status = self.hdfs.list_dir(fd.abspath)
        except pywebhdfs.errors.Unauthorized as e:
            self.logger.info("Unauthorized for path {0}: {1}".format(
                fd.abspath, e))
            raise Errors.Unauthorized(
                "Unauthorized access to the path {0}: {1}".format(
                    fd.abspath, e))
        except requests.exceptions.RequestException as e:
            self.logger.info("ConnectionError for path {0}: {1}".format(
                fd.abspath, e))
            raise Errors.BadConnection(
                "Connection error while looking for path: {0}, exc={1}".format(
                    fd.abspath, e))
        except pywebhdfs.errors.PyWebHdfsException as e:
            self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                fd.abspath, e))
            raise Errors.FsException(
                "An exception happened while looking for path: {0}, exc={1}".
                format(fd.abspath, e))
        currentDir = status["FileStatuses"]["FileStatus"]
        for item in currentDir:
            yield HadoopFileDescriptor(self,
                                       fd.abspath,
                                       isSrc=True,
                                       needsDstDirCheck=False,
                                       fileJson=item)

    def make_dir(self, path):
        if self.simulateOnly:
            print("SIMULATE -> make dir: " + path)
        else:
            try:
                self.hdfs.make_dir(path)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    path, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(path, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    path, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS create directory: {0}, exc={1}"
                    .format(path, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    path, e))
                raise Errors.FsException(
                    "An exception happened during HDFS create directory: {0}, exc={1}"
                    .format(path, e))

    def open_file(self, fd, rwMode):
        return fd

    def close_file(self, fd):
        pass

    def touch_file(self, fd):
        if self.simulateOnly:
            print("SIMULATE -> touch file: " + fd.abspath)
        else:
            try:
                # create the file with an empty payload
                self.hdfs.create_file(fd.abspath, b'', overwrite=True)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(
                        fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS create file: {0}, exc={1}".
                    format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS create file: {0}, exc={1}"
                    .format(fd.abspath, e))

    def truncate_file(self, fd, size):
        if self.simulateOnly:
            print("SIMULATE -> truncate file: {0}, size={1}".format(
                fd.abspath, size))
        else:
            try:
                self.hdfs.truncate_file(fd.abspath, size)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(
                        fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS truncate file: {0}, exc={1}".
                    format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS truncate file: {0}, exc={1}"
                    .format(fd.abspath, e))

    def try_concat_files(self, fd, chunkFdList):
        # Workaround for unordered concat bug in Hadoop 2.7.1 is to use one source at the time
        # https://issues.apache.org/jira/browse/HDFS-8891
        currIndex = 0
        concatStep = 20
        chunkedList = [
            chunkFdList[pos:pos + concatStep]
            for pos in range(0, len(chunkFdList), concatStep)
        ]
        for sourceChunk in chunkedList:
            try:
                self.concat_files(fd, sourceChunk)
                currIndex += len(sourceChunk)
            except Errors.FsException:
                # stop at the first failing chunk; report how many succeeded
                break

        return currIndex

    def concat_files(self, fd, chunkFdList):
        strList = list()
        for chunkFd in chunkFdList:
            strList.append(chunkFd.abspath)

        if self.simulateOnly:
            print("SIMULATE -> concat file: {0}, sources={1}".format(
                fd.abspath, ",".join(strList)))
        else:
            try:
                self.hdfs.concat_files(fd.abspath, strList)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(
                        fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS concat file: {0}, exc={1}".
                    format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS concat file: {0}, exc={1}"
                    .format(fd.abspath, e))

    def read_data(self, fd, offset, size):
        if offset >= fd.size:
            return ""
        else:
            try:
                contents = self.hdfs.read_file(fd.abspath,
                                               offset=offset,
                                               length=size)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(
                        fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS read file: {0}, exc={1}".
                    format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS read file: {0}, exc={1}"
                    .format(fd.abspath, e))
            return contents

    def append_data(self, fd, data):
        if self.simulateOnly:
            print("SIMULATE -> write file data: " + fd.abspath)
        else:
            try:
                self.hdfs.append_file(fd.abspath, data)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(
                        fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS append file: {0}, exc={1}".
                    format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS append file: {0}, exc={1}"
                    .format(fd.abspath, e))

    def local_mv_file(self, src, dst):
        if self.simulateOnly:
            print("SIMULATE -> local move file: {0} -> {1} ".format(
                src.abspath, dst.abspath))
        else:
            try:
                self.hdfs.rename_file_dir(src.abspath, dst.abspath)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    src.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(
                        src.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    src.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS rename file: {0}, exc={1}".
                    format(src.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    src.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS rename file: {0}, exc={1}"
                    .format(src.abspath, e))

    def local_cp_file(self, src, dst):
        # This is an open issue in Hadoop community: https://issues.apache.org/jira/browse/HDFS-3370
        # Instead, we can do a symbolic link
        if self.simulateOnly:
            print("SIMULATE -> local copy file: {0} -> {1} ".format(
                src.abspath, dst.abspath))
        else:
            print(
                "Copy within HDFS is not supported due to lack of Hadoop support"
            )
            print(
                "Once symbolic links are enabled, this feature will be enabled"
            )
            sys.exit(1)
            # self.hdfs.create_sym_link(src.abspath, dst.abspath, createParent=True)

    def get_hdfs_file_dir_json(self, path):
        try:
            status = self.hdfs.get_file_dir_status(path)
            return status["FileStatus"]
        except pywebhdfs.errors.FileNotFound:
            return None

    def validate_hdfs_arg(self, arg):
        if not arg.startswith(self.vcPath):
            print("Error: You don't have permissions to the path: %s" % arg)
            print("Your path must be rooted under: %s" % self.vcPath)
            sys.exit(1)
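# A minimal usage sketch for HadoopFileSystem; the vcPath prefix, connection
# details and file path are placeholders, and Config.Config() is assumed to
# fill in any HDFS defaults that are not passed explicitly:
fs = HadoopFileSystem('/vc/project', user='hdfs',
                      host='namenode.example.com', port='50070')
fs.validate_hdfs_arg('/vc/project/data/part-00000')
fd = fs.make_fd('/vc/project/data/part-00000', isSrc=True, dstDirMustExist=False)
data = fs.read_data(fd, offset=0, size=1024)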
class Daemon(ConsumerMixin):
    def __init__(self, config):
        self.connection = None
        self.queues = []
        self.config = config
        self._init_rabbitmq()
        if self.config['storage']['localfs']:
            self.logfile = None
            self.current_logfile_path = ''
        # Initialize WebHDFS client
        if self.config['storage']['hdfs']:
            self.hdfs = PyWebHdfsClient(host=config['webhdfs']['host'], port=config['webhdfs']['port'], timeout=config['webhdfs']['timeout'])
            self.filename_template = config['webhdfs']['filename_template']

    def _init_rabbitmq(self):
        """
        connect to rabbitmq and init the queues
        """
        self.connection = kombu.Connection(self.config['rabbitmq']['broker-url'])
        exchange_name = self.config['rabbitmq']['exchange-name']
        exchange = kombu.Exchange(exchange_name, type="direct")
        logging.getLogger('stat_logger').info("listen following exchange: %s", exchange_name)
        print ("listen following exchange: {}".format(exchange_name))

        queue = kombu.Queue(exchange=exchange, durable=False, auto_delete=True)
        self.queues.append(queue)
        
    def get_consumers(self, Consumer, channel):
        return [Consumer(queues=self.queues, callbacks=[self.process_task])]

    def process_task(self, body, message):
        stat_request = stat_logger.stat_pb2.StatRequest()
        try:
            stat_request.ParseFromString(body)
            logging.getLogger('stat_logger').debug('query received: {}'.format(str(stat_request)))
        except DecodeError as e:
            logging.getLogger('stat_logger').warning(
                "message is not a valid protobuf task: {}".format(str(e)))
            message.ack()
            return

        self.log_message(stat_request)
        message.ack()

    def log_message(self, stat_hit):
        if stat_hit.IsInitialized():
            content = json.dumps(protobuf_to_dict(stat_hit), separators=(',', ':')) + '\n'

            if self.config['storage']['localfs']:
                self._reopen_logfile(datetime.utcfromtimestamp(stat_hit.request_date))
                self.logfile.write(content)
                self.logfile.flush()

            if self.config['storage']['hdfs']:
                target_filename = self.filename_template.replace('{request_date}', datetime.utcfromtimestamp(stat_hit.request_date).strftime('%Y%m%d'))
                try:
                    self.hdfs.append_file(target_filename, content)
                except Exception as e:
                    print(e)

    def _reopen_logfile(self, log_date):
        expected_logfile_path = self._get_logfile_path(log_date)
        if self.current_logfile_path != expected_logfile_path:
            if self.logfile is not None:
                self.logfile.close()
            print "Opening file " + expected_logfile_path
            expected_log_dir = os.path.dirname(expected_logfile_path)
            if not os.path.isdir(expected_log_dir):
                os.makedirs(expected_log_dir)
            self.logfile = gzip.open(expected_logfile_path, 'a')
            self.current_logfile_path = expected_logfile_path

    def _get_logfile_path(self, log_date):
        return self.config['localfs']['root_dir'] + '/' + log_date.strftime('%Y/%m/%d') + '/stat_log_prod_' + log_date.strftime('%Y%m%d') + '_' + platform.node() + '_' + str(os.getpid()) + '.json.log.gz'

    def __del__(self):
        self.close()

    def close(self):
        if self.logfile is not None:
            self.logfile.close()
        if self.connection and self.connection.connected:
            self.connection.release()