def update_raw_stage(output, delivery_tag):
    #context = zmq.Context()
    #confirm = context.socket(zmq.PUSH)
    #confirm.connect(confirm_host)
    hdfs = PyWebHdfsClient(host=webhdfs_host, port=webhdfs_port, user_name=webhdfs_user)
    impala_conn = connect(host=impala_host, port=int(impala_port))
    cur = impala_conn.cursor()
    start_time = time.time()
    for k, v in output.iteritems():
        if (time.time() - start_time)/60 > sink_minutes:
            sink_logger.warning('ETL process running longer than sink timeout: {0} minutes'.format((time.time() - start_time)/60))
        try:
            file_name = 'user/impala/test_log_1/raw_log_{0}.txt'.format(k)
            hdfs.append_file(file_name, '\n'.join(v))
            cur.execute('refresh test_log_{0}'.format(k))
        except hdfs_err.PyWebHdfsException:
            file_name = 'user/impala/test_log_1/raw_log_{0}.txt'.format(k)
            hdfs.create_file(file_name, '')
            hdfs.append_file(file_name, '\n'.join(v))
            cur.execute('refresh test_log_{0}'.format(k))
    #confirm.send(delivery_tag)
    sink_logger.info('ETL process finished in {0} minutes'.format((time.time() - start_time)/60))
    sink_logger.info('ETL process finished with {0} delivery_tag'.format(delivery_tag))
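# A minimal sketch of how update_raw_stage might be invoked, assuming the
# surrounding module already defines webhdfs_host, impala_host, sink_minutes
# and sink_logger, and that `output` maps a table suffix to a list of raw log
# lines. The keys, log lines and delivery tag below are hypothetical
# illustration values only, not taken from the original code.
sample_output = {
    '1': ['2016-01-01T00:00:00\tGET /index\t200',
          '2016-01-01T00:00:01\tGET /about\t200'],
}
update_raw_stage(sample_output, delivery_tag=42)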
class WhenTestingAppendOperation(unittest.TestCase):

    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.location = 'redirect_uri'
        self.path = 'user/hdfs'
        self.file_data = '010101'
        self.init_response = MagicMock()
        self.init_response.header = {'location': self.location}
        self.response = MagicMock()

    def test_append_throws_exception_for_no_redirect(self):
        self.init_response.status_code = httplib.BAD_REQUEST
        self.response.status_code = httplib.OK
        self.requests.post.side_effect = [self.init_response, self.response]
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.append_file(self.path, self.file_data)

    def test_append_throws_exception_for_not_ok(self):
        self.init_response.status_code = httplib.TEMPORARY_REDIRECT
        self.response.status_code = httplib.BAD_REQUEST
        self.requests.post.side_effect = [self.init_response, self.response]
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.append_file(self.path, self.file_data)

    def test_append_returns_true(self):
        self.init_response.status_code = httplib.TEMPORARY_REDIRECT
        self.response.status_code = httplib.OK
        self.requests.post.side_effect = [self.init_response, self.response]
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            result = self.webhdfs.append_file(self.path, self.file_data)
            self.assertTrue(result)
class WhenTestingAppendOperation(unittest.TestCase):

    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.location = 'redirect_uri'
        self.path = 'user/hdfs'
        self.file_data = '010101'
        self.init_response = MagicMock()
        self.init_response.header = {'location': self.location}
        self.response = MagicMock()

    def test_append_throws_exception_for_no_redirect(self):
        self.init_response.status_code = http_client.BAD_REQUEST
        self.response.status_code = http_client.OK
        self.requests.post.side_effect = [self.init_response, self.response]
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.append_file(self.path, self.file_data)

    def test_append_throws_exception_for_not_ok(self):
        self.init_response.status_code = http_client.TEMPORARY_REDIRECT
        self.response.status_code = http_client.BAD_REQUEST
        self.requests.post.side_effect = [self.init_response, self.response]
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.append_file(self.path, self.file_data)

    def test_append_returns_true(self):
        self.init_response.status_code = http_client.TEMPORARY_REDIRECT
        self.response.status_code = http_client.OK
        self.requests.post.side_effect = [self.init_response, self.response]
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            result = self.webhdfs.append_file(self.path, self.file_data)
            self.assertTrue(result)
hdfs.make_dir(example_dir)

# get a dictionary of the directory's status
dir_status = hdfs.get_file_dir_status(example_dir)
print dir_status

# create a new file on hdfs
print('making new file at: {0}\n'.format(example_file))
hdfs.create_file(example_file, example_data)

file_status = hdfs.get_file_dir_status(example_file)
print file_status

# append to the file created in previous step
print('appending to file at: {0}\n'.format(example_file))
hdfs.append_file(example_file, example_data)

file_status = hdfs.get_file_dir_status(example_file)
print file_status

# read in the data for the file
print('reading data from file at: {0}\n'.format(example_file))
file_data = hdfs.read_file(example_file)
print file_data

# rename the example_dir
print('renaming directory from {0} to {1}\n'.format(example_dir, rename_dir))
hdfs.rename_file_dir(example_dir, '/{0}'.format(rename_dir))

# list the contents of the new directory
listdir_stats = hdfs.list_dir(rename_dir)
print(dir_status)

# create a new file on hdfs
print('making new file at: {0}\n'.format(example_file))
hdfs.create_file(example_file, example_data)

file_status = hdfs.get_file_dir_status(example_file)
print(file_status)

# get the checksum for the file
file_checksum = hdfs.get_file_checksum(example_file)
print(file_checksum)

# append to the file created in previous step
print('appending to file at: {0}\n'.format(example_file))
hdfs.append_file(example_file, example_data)

file_status = hdfs.get_file_dir_status(example_file)
print(file_status)

# checksum reflects file changes
file_checksum = hdfs.get_file_checksum(example_file)
print(file_checksum)

# read in the data for the file
print('reading data from file at: {0}\n'.format(example_file))
file_data = hdfs.read_file(example_file)
print(file_data)

# rename the example_dir
print('renaming directory from {0} to {1}\n'.format(example_dir, rename_dir))
class HDFS(object):
    def __init__(self, host, port, user):
        self._hdfs = PyWebHdfsClient(host=host, port=port,
                                     user_name=user, timeout=None)
        logging.debug('webhdfs = %s@%s:%s', user, host, port)

    def recursive_copy(self, local_path, remote_path, exclude=None, permission=755):
        if exclude is None:
            exclude = []
        c_path = canonicalize(remote_path)
        logging.debug('making %s', c_path)
        self._hdfs.make_dir(c_path, permission=permission)
        fs_g = os.walk(local_path)
        for dpath, dnames, fnames in fs_g:
            _, relative_path = dpath.split(local_path)
            for dname in dnames:
                if dname not in exclude:
                    c_path = canonicalize('%s/%s/%s' % (remote_path, relative_path, dname))
                    logging.debug('making %s', c_path)
                    self._hdfs.make_dir(c_path, permission=permission)

            for fname in fnames:
                if fname not in exclude:
                    data = open(canonicalize('%s/%s/%s' % (local_path, relative_path, fname)), 'rb')
                    c_path = canonicalize('%s/%s/%s' % (remote_path, relative_path, fname))
                    logging.debug('creating %s', c_path)
                    self._hdfs.create_file(c_path, data, overwrite=True, permission=permission)
                    data.close()

    def make_dir(self, path, permission=755):
        logging.debug('make_dir: %s', path)
        self._hdfs.make_dir(canonicalize(path), permission=permission)

    def create_file(self, data, remote_file_path, permission=755):
        logging.debug('create_file: %s', remote_file_path)
        sio = BytesIO(data)
        self._hdfs.create_file(canonicalize(remote_file_path), sio,
                               overwrite=True, permission=permission)

    def append_file(self, data, remote_file_path):
        logging.debug('append to: %s', remote_file_path)
        self._hdfs.append_file(canonicalize(remote_file_path), data)

    def stream_file_to_disk(self, remote_file_path, local_file_path):
        chunk_size = 10 * 1024 * 1024
        offset = 0
        with open(local_file_path, 'wb') as dest_file:
            data = self._hdfs.read_file(canonicalize(remote_file_path),
                                        offset=offset, length=chunk_size)
            while True:
                dest_file.write(data)
                if len(data) < chunk_size:
                    break
                offset += chunk_size
                data = self._hdfs.read_file(canonicalize(remote_file_path),
                                            offset=offset, length=chunk_size)

    def read_file(self, remote_file_path):
        data = self._hdfs.read_file(canonicalize(remote_file_path))
        return data

    def remove(self, path, recursive=False):
        logging.debug('remove: %s', path)
        self._hdfs.delete_file_dir(canonicalize(path), recursive)

    def file_exists(self, path):
        try:
            self._hdfs.get_file_dir_status(path)
            return True
        except:
            return False
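# A minimal usage sketch for the HDFS wrapper above, assuming a reachable
# WebHDFS endpoint and the canonicalize() helper from the same module; the
# host, port, user and paths below are hypothetical example values.
hdfs = HDFS(host='namenode.example.com', port='50070', user='hdfs')
hdfs.make_dir('/tmp/example')
# note: these wrapper methods take the data first, then the remote path
hdfs.create_file(b'hello hdfs\n', '/tmp/example/hello.txt')
hdfs.append_file(b'another line\n', '/tmp/example/hello.txt')
print(hdfs.read_file('/tmp/example/hello.txt'))
hdfs.remove('/tmp/example', recursive=True)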
class HDFS(object):
    def __init__(self, host, port, user):
        self._hdfs = PyWebHdfsClient(
            host=host, port=port, user_name=user, timeout=None)
        logging.debug('webhdfs = %s@%s:%s', user, host, port)

    def recursive_copy(self, local_path, remote_path, exclude=None):
        if exclude is None:
            exclude = []
        c_path = canonicalize(remote_path)
        logging.debug('making %s', c_path)
        self._hdfs.make_dir(c_path)
        fs_g = os.walk(local_path)
        for dpath, dnames, fnames in fs_g:
            _, relative_path = dpath.split(local_path)
            for dname in dnames:
                if dname not in exclude:
                    c_path = canonicalize(
                        '%s/%s/%s' % (remote_path, relative_path, dname))
                    logging.debug('making %s', c_path)
                    self._hdfs.make_dir(c_path)

            for fname in fnames:
                if fname not in exclude:
                    data = file(
                        canonicalize(
                            '%s/%s/%s' % (local_path, relative_path, fname)),
                        'rb')
                    c_path = canonicalize(
                        '%s/%s/%s' % (remote_path, relative_path, fname))
                    logging.debug('creating %s', c_path)
                    self._hdfs.create_file(c_path, data, overwrite=True)
                    data.close()

    def make_dir(self, path):
        logging.debug('make_dir: %s', path)
        self._hdfs.make_dir(canonicalize(path))

    def create_file(self, data, remote_file_path):
        logging.debug('create_file: %s', remote_file_path)
        sio = StringIO.StringIO(data)
        self._hdfs.create_file(
            canonicalize(remote_file_path), sio, overwrite=True)

    def append_file(self, data, remote_file_path):
        logging.debug('append to: %s', remote_file_path)
        self._hdfs.append_file(canonicalize(remote_file_path), data)

    def stream_file_to_disk(self, remote_file_path, local_file_path):
        chunk_size = 10 * 1024 * 1024
        offset = 0
        with open(local_file_path, 'wb') as dest_file:
            data = self._hdfs.read_file(canonicalize(remote_file_path),
                                        offset=offset, length=chunk_size)
            while True:
                dest_file.write(data)
                if len(data) < chunk_size:
                    break
                offset += chunk_size
                data = self._hdfs.read_file(canonicalize(remote_file_path),
                                            offset=offset, length=chunk_size)

    def read_file(self, remote_file_path):
        data = self._hdfs.read_file(canonicalize(remote_file_path))
        return data

    def remove(self, path, recursive=False):
        logging.debug('remove: %s', path)
        self._hdfs.delete_file_dir(canonicalize(path), recursive)
class Daemon(ConsumerMixin):
    def __init__(self, config):
        self.connection = None
        self.queues = []
        self.config = config
        logging.config.dictConfig(config['logger'])
        self._init_rabbitmq()
        if self.config['storage']['localfs']:
            self.logfile = None
            self.current_logfile_path = ''
        # Initialize WebHDFS client
        if self.config['storage']['hdfs']:
            self.hdfs = PyWebHdfsClient(host=config['webhdfs']['host'],
                                        port=config['webhdfs']['port'],
                                        timeout=config['webhdfs']['timeout'])
            self.filename_template = config['webhdfs']['filename_template']

    def _init_rabbitmq(self):
        """
        connect to rabbitmq and init the queues
        """
        self.connection = kombu.Connection(
            self.config['rabbitmq']['broker_url'])
        exchange_name = self.config['rabbitmq']['exchange_name']
        exchange = kombu.Exchange(exchange_name, type="topic")
        logging.getLogger('stat_logger').info(
            "listen exchange {0:s} on {1:s}".format(
                exchange_name, self.config['rabbitmq']['broker_url']))
        queue = kombu.Queue(name=self.config['rabbitmq']['queue_name'],
                            exchange=exchange,
                            durable=False,
                            auto_delete=self.config['rabbitmq']['auto_delete'],
                            routing_key="#")
        self.queues.append(queue)

    def get_consumers(self, Consumer, channel):
        return [Consumer(queues=self.queues, callbacks=[self.process_task])]

    def process_task(self, body, message):
        stat_request = stat_logger.stat_pb2.StatRequest()
        try:
            stat_request.ParseFromString(body)
            logging.getLogger('stat_logger').debug(
                'query received: {}'.format(str(stat_request)))
        except DecodeError as e:
            logging.getLogger('stat_logger').warn(
                "message is not a valid protobuf task: {}".format(str(e)))
            message.ack()
            return
        self.log_message(stat_request)
        message.ack()

    def log_message(self, stat_hit):
        if stat_hit.IsInitialized():
            content = json.dumps(protobuf_to_dict(stat_hit),
                                 separators=(',', ':')) + '\n'
            if self.config['storage']['localfs']:
                self._reopen_logfile(
                    datetime.utcfromtimestamp(stat_hit.request_date))
                self.logfile.write(content)
                self.logfile.flush()
            if self.config['storage']['hdfs']:
                target_filename = self.filename_template.replace(
                    '{request_date}',
                    datetime.utcfromtimestamp(
                        stat_hit.request_date).strftime('%Y%m%d'))
                try:
                    self.hdfs.append_file(target_filename, content)
                except Exception as e:
                    print e

    def _reopen_logfile(self, log_date):
        expected_logfile_path = self._get_logfile_path(log_date)
        if self.current_logfile_path != expected_logfile_path:
            if self.logfile is not None:
                self.logfile.close()
            print "Opening file " + expected_logfile_path
            expected_log_dir = os.path.dirname(expected_logfile_path)
            if not os.path.isdir(expected_log_dir):
                os.makedirs(expected_log_dir)
            self.logfile = open(expected_logfile_path, 'a')
            self.current_logfile_path = expected_logfile_path

    def _get_logfile_path(self, log_date):
        return self.config['localfs']['root_dir'] + '/' + log_date.strftime(
            '%Y/%m/%d') + '/stat_log_prod_' + log_date.strftime(
                '%Y%m%d') + '_' + platform.node() + '_' + str(
                    os.getpid()) + '.json.log'

    def __del__(self):
        self.close()

    def close(self):
        if self.logfile is not None:
            self.logfile.close()
        if self.connection and self.connection.connected:
            self.connection.release()
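# A hedged sketch of the configuration dict this Daemon appears to expect,
# inferred only from the keys read in the class above; every value shown is a
# hypothetical example, not taken from the original project.
config = {
    'logger': {'version': 1},                       # passed to logging.config.dictConfig
    'storage': {'localfs': True, 'hdfs': True},
    'localfs': {'root_dir': '/var/log/stat_logger'},
    'webhdfs': {
        'host': 'namenode.example.com',
        'port': '50070',
        'timeout': 30,
        'filename_template': '/data/stats/stat_{request_date}.json.log',
    },
    'rabbitmq': {
        'broker_url': 'amqp://guest:guest@localhost:5672//',
        'exchange_name': 'stat_exchange',
        'queue_name': 'stat_queue',
        'auto_delete': False,
    },
}
# ConsumerMixin provides run(), which consumes messages until interrupted
Daemon(config).run()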
class HadoopFileSystem(BaseFs.FileSystem):
    def __init__(self, vcPath, simulateOnly=False, isVerbose=False, logger=None,
                 user=None, host=None, port=None):
        BaseFs.FileSystem.__init__(self, simulateOnly, isVerbose, logger)
        config = Config.Config()
        hdfsUser, hdfsHost, hdfsPort = config.getHadoopConfig(user, host, port)
        self.hdfs = PyWebHdfsClient(host=hdfsHost, port=hdfsPort, user_name=hdfsUser)
        self.vcPath = vcPath

    def make_fd(self, path, isSrc, dstDirMustExist):
        fd = None
        try:
            fd = HadoopFileDescriptor(self, path, isSrc, dstDirMustExist)
        except pywebhdfs.errors.FileNotFound:
            self.logger.info("DESC: does not exist: " + path)
            raise Errors.FileNotFound("Path {0} does not exist".format(path))
        except pywebhdfs.errors.Unauthorized as e:
            self.logger.info("Unauthorized for path {0}: {1}".format(path, e))
            raise Errors.Unauthorized(
                "Unauthorized access to the path {0}: {1}".format(path, e))
        except requests.exceptions.RequestException as e:
            self.logger.info("ConnectionError for path {0}: {1}".format(path, e))
            raise Errors.BadConnection(
                "Connection error while looking for path: {0}, exc={1}".format(path, e))
        except pywebhdfs.errors.PyWebHdfsException as e:
            self.logger.info("PyWebHdfsException for path {0}: {1}".format(path, e))
            raise Errors.FsException(
                "An exception happened while looking for path: {0}, exc={1}".format(path, e))
        return fd

    def exists_file_dir(self, fd):
        try:
            return self.hdfs.exists_file_dir(fd.abspath)
        except pywebhdfs.errors.Unauthorized as e:
            self.logger.info("Unauthorized for path {0}: {1}".format(fd.abspath, e))
            raise Errors.Unauthorized(
                "Unauthorized access to the path {0}: {1}".format(fd.abspath, e))
        except requests.exceptions.RequestException as e:
            self.logger.info("ConnectionError for path {0}: {1}".format(fd.abspath, e))
            raise Errors.BadConnection(
                "Connection error during HDFS exists test: {0}, exc={1}".format(fd.abspath, e))
        except pywebhdfs.errors.PyWebHdfsException as e:
            self.logger.info("PyWebHdfsException for path {0}: {1}".format(fd.abspath, e))
            raise Errors.FsException(
                "An exception happened during HDFS exists test: {0}, exc={1}".format(fd.abspath, e))

    def delete_file_dir(self, fd, recursive=False, force=False):
        if self.simulateOnly:
            print("SIMULATE -> remove file/dir: {0}, recursive={1}".format(fd.abspath, recursive))
        else:
            try:
                if not recursive or force or \
                        query_yes_no(question="Are you sure you want to delete folder recursively?",
                                     default="no"):
                    status = self.hdfs.delete_file_dir(fd.abspath, recursive=recursive)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS delete directory: {0}, exc={1}".format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS delete directory: {0}, exc={1}".format(fd.abspath, e))

    def list_dir(self, fd):
        try:
            status = self.hdfs.list_dir(fd.abspath)
        except pywebhdfs.errors.Unauthorized as e:
            self.logger.info("Unauthorized for path {0}: {1}".format(fd.abspath, e))
            raise Errors.Unauthorized(
                "Unauthorized access to the path {0}: {1}".format(fd.abspath, e))
        except requests.exceptions.RequestException as e:
            self.logger.info("ConnectionError for path {0}: {1}".format(fd.abspath, e))
            raise Errors.BadConnection(
                "Connection error while looking for path: {0}, exc={1}".format(fd.abspath, e))
        except pywebhdfs.errors.PyWebHdfsException as e:
            self.logger.info("PyWebHdfsException for path {0}: {1}".format(fd.abspath, e))
            raise Errors.FsException(
                "An exception happened while looking for path: {0}, exc={1}".format(fd.abspath, e))
        currentDir = status["FileStatuses"]["FileStatus"]
        for item in currentDir:
            yield HadoopFileDescriptor(self, fd.abspath, isSrc=True,
                                       needsDstDirCheck=False, fileJson=item)

    def make_dir(self, path):
        if self.simulateOnly:
            print("SIMULATE -> make dir: " + path)
        else:
            try:
                self.hdfs.make_dir(path)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(path, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(path, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(path, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS create directory: {0}, exc={1}".format(path, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(path, e))
                raise Errors.FsException(
                    "An exception happened during HDFS create directory: {0}, exc={1}".format(path, e))

    def open_file(self, fd, rwMode):
        return fd

    def close_file(self, fd):
        pass

    def touch_file(self, fd):
        if self.simulateOnly:
            print("SIMULATE -> touch file: " + fd.abspath)
        else:
            try:
                self.hdfs.create_file(fd.abspath, 0, overwrite=True)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS create file: {0}, exc={1}".format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS create file: {0}, exc={1}".format(fd.abspath, e))

    def truncate_file(self, fd, size):
        if self.simulateOnly:
            print("SIMULATE -> truncate file: {0}, size={1}".format(fd.abspath, size))
        else:
            try:
                self.hdfs.truncate_file(fd.abspath, size)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS truncate file: {0}, exc={1}".format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS truncate file: {0}, exc={1}".format(fd.abspath, e))

    def try_concat_files(self, fd, chunkFdList):
        # Workaround for unordered concat bug in Hadoop 2.7.1 is to use one source at the time
        # https://issues.apache.org/jira/browse/HDFS-8891
        currIndex = 0
        concatStep = 20
        chunkedList = [chunkFdList[pos:pos + concatStep]
                       for pos in range(0, len(chunkFdList), concatStep)]
        for sourceChunk in chunkedList:
            try:
                self.concat_files(fd, sourceChunk)
                currIndex += len(sourceChunk)
            except Errors.FsException as e:
                break
        return currIndex

    def concat_files(self, fd, chunkFdList):
        strList = list()
        for chunkFd in chunkFdList:
            strList.append(chunkFd.abspath)
        if self.simulateOnly:
            print("SIMULATE -> concat file: {0}, sources={1}".format(fd.abspath, ",".join(strList)))
        else:
            try:
                self.hdfs.concat_files(fd.abspath, strList)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS concat file: {0}, exc={1}".format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS concat file: {0}, exc={1}".format(fd.abspath, e))

    def read_data(self, fd, offset, size):
        if offset >= fd.size:
            return ""
        else:
            try:
                contents = self.hdfs.read_file(fd.abspath, offset=offset, length=size)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS read file: {0}, exc={1}".format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS read file: {0}, exc={1}".format(fd.abspath, e))
            return contents

    def append_data(self, fd, data):
        if self.simulateOnly:
            print("SIMULATE -> write file data: " + fd.abspath)
        else:
            try:
                self.hdfs.append_file(fd.abspath, data)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS append file: {0}, exc={1}".format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS append file: {0}, exc={1}".format(fd.abspath, e))

    def local_mv_file(self, src, dst):
        if self.simulateOnly:
            print("SIMULATE -> local move file: {0} -> {1} ".format(src.abspath, dst.abspath))
        else:
            try:
                self.hdfs.rename_file_dir(src.abspath, dst.abspath)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(src.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(src.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(src.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS rename file: {0}, exc={1}".format(src.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(src.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS rename file: {0}, exc={1}".format(src.abspath, e))

    def local_cp_file(self, src, dst):
        # This is an open issue in Hadoop community: https://issues.apache.org/jira/browse/HDFS-3370
        # Instead, we can do a symbolic link
        if self.simulateOnly:
            print("SIMULATE -> local copy file: {0} -> {1} ".format(src.abspath, dst.abspath))
        else:
            print("Copy within HDFS is not supported due to lack of Hadoop support")
            print("Once symbolic links are enabled, this feature will be enabled")
            sys.exit(1)
            # self.hdfs.create_sym_link(src.abspath, dst.abspath, createParent=True)

    def get_hdfs_file_dir_json(self, path):
        try:
            status = self.hdfs.get_file_dir_status(path)
            return status["FileStatus"]
        except pywebhdfs.errors.FileNotFound:
            return None

    def validate_hdfs_arg(self, arg):
        if not arg.startswith(self.vcPath):
            print("Error: You don't have permissions to the path: %s" % arg)
            print("Your path must be rooted under: %s" % self.vcPath)
            sys.exit(1)
class Daemon(ConsumerMixin):
    def __init__(self, config):
        self.connection = None
        self.queues = []
        self.config = config
        self._init_rabbitmq()
        if self.config['storage']['localfs']:
            self.logfile = None
            self.current_logfile_path = ''
        # Initialize WebHDFS client
        if self.config['storage']['hdfs']:
            self.hdfs = PyWebHdfsClient(host=config['webhdfs']['host'],
                                        port=config['webhdfs']['port'],
                                        timeout=config['webhdfs']['timeout'])
            self.filename_template = config['webhdfs']['filename_template']

    def _init_rabbitmq(self):
        """
        connect to rabbitmq and init the queues
        """
        self.connection = kombu.Connection(self.config['rabbitmq']['broker-url'])
        exchange_name = self.config['rabbitmq']['exchange-name']
        exchange = kombu.Exchange(exchange_name, type="direct")
        logging.getLogger('stat_logger').info("listen following exchange: %s", exchange_name)
        print("listen following exchange: {}".format(exchange_name))
        queue = kombu.Queue(exchange=exchange, durable=False, auto_delete=True)
        self.queues.append(queue)

    def get_consumers(self, Consumer, channel):
        return [Consumer(queues=self.queues, callbacks=[self.process_task])]

    def process_task(self, body, message):
        stat_request = stat_logger.stat_pb2.StatRequest()
        try:
            stat_request.ParseFromString(body)
            logging.getLogger('stat_logger').debug('query received: {}'.format(str(stat_request)))
        except DecodeError as e:
            logging.getLogger('stat_logger').warn("message is not a valid "
                                                  "protobuf task: {}".format(str(e)))
            message.ack()
            return
        self.log_message(stat_request)
        message.ack()

    def log_message(self, stat_hit):
        if stat_hit.IsInitialized():
            content = json.dumps(protobuf_to_dict(stat_hit), separators=(',', ':')) + '\n'
            if self.config['storage']['localfs']:
                self._reopen_logfile(datetime.utcfromtimestamp(stat_hit.request_date))
                self.logfile.write(content)
                self.logfile.flush()
            if self.config['storage']['hdfs']:
                target_filename = self.filename_template.replace(
                    '{request_date}',
                    datetime.utcfromtimestamp(stat_hit.request_date).strftime('%Y%m%d'))
                try:
                    self.hdfs.append_file(target_filename, content)
                except Exception as e:
                    print e

    def _reopen_logfile(self, log_date):
        expected_logfile_path = self._get_logfile_path(log_date)
        if self.current_logfile_path != expected_logfile_path:
            if self.logfile is not None:
                self.logfile.close()
            print "Opening file " + expected_logfile_path
            expected_log_dir = os.path.dirname(expected_logfile_path)
            if not os.path.isdir(expected_log_dir):
                os.makedirs(expected_log_dir)
            self.logfile = gzip.open(expected_logfile_path, 'a')
            self.current_logfile_path = expected_logfile_path

    def _get_logfile_path(self, log_date):
        return (self.config['localfs']['root_dir'] + '/' +
                log_date.strftime('%Y/%m/%d') + '/stat_log_prod_' +
                log_date.strftime('%Y%m%d') + '_' + platform.node() + '_' +
                str(os.getpid()) + '.json.log.gz')

    def __del__(self):
        self.close()

    def close(self):
        if self.logfile is not None:
            self.logfile.close()
        if self.connection and self.connection.connected:
            self.connection.release()