def get_log_client(log_link):
    """Return a cached HTTP client for the server that hosts ``log_link``.

    Clients are kept in the module-level ``_log_client_heap`` as
    ``(epoch time, client object)`` tuples, so the entry with the oldest
    timestamp sits at the heap root and is the one replaced once the heap
    holds ``MAX_HEAP_SIZE`` entries (least recently used eviction).

    Args:
        log_link: URL of a log resource. Only its scheme and netloc are used
            to build the base URL that identifies the client.

    Returns:
        The client whose ``base_url`` matches the link. A new ``HttpClient``
        is created when none is cached, with Kerberos auth enabled if the
        cluster configuration reports ``SECURITY_ENABLED``.
    """
    _log_client_lock.acquire()
    try:
        components = lib_urlsplit(log_link)
        base_url = '%(scheme)s://%(netloc)s' % {
            'scheme': components[0],
            'netloc': components[1],
        }

        # Entries take the form (epoch time, client object).
        # Least Recently Used algorithm.
        client_tuple = next(
            (tup for tup in _log_client_heap if tup[1].base_url == base_url),
            None,
        )
        if client_tuple is None:
            client = HttpClient(base_url, logger=LOG)
            yarn_cluster = cluster.get_cluster_conf_for_job_submission()
            if yarn_cluster.SECURITY_ENABLED.get():
                client.set_kerberos_auth()
        else:
            # list.remove() deletes from an arbitrary slot and shifts the
            # tail, which breaks the heap ordering. Re-heapify so the root is
            # the oldest entry again before the push/replace below.
            _log_client_heap.remove(client_tuple)
            heapq.heapify(_log_client_heap)
            client = client_tuple[1]

        # Re-insert with a fresh timestamp to mark the client as just used.
        # NOTE(review): two entries with identical timestamps make heapq
        # compare the client objects themselves - confirm HttpClient
        # tolerates ordering comparisons.
        new_client_tuple = (time.time(), client)
        if len(_log_client_heap) >= MAX_HEAP_SIZE:
            # Full: drop the root (oldest timestamp) and insert the new entry.
            heapq.heapreplace(_log_client_heap, new_client_tuple)
        else:
            heapq.heappush(_log_client_heap, new_client_tuple)

        return client
    finally:
        _log_client_lock.release()
def retrieve_log_content(self, log_links, log_name, username, offset):
    """Fetch the text of one named log from its link.

    Args:
        log_links: Mapping of log name to log URL; may be empty or None.
        log_name: Requested log. Any value other than ``'stderr'``
            (including empty or None) is treated as ``'stdout'``.
        username: Sent as the ``doAs`` query parameter.
        offset: Sent as the ``start`` query parameter when it is not 0.

    Returns:
        The text content of the first node matching
        ``/html/body/table/tbody/tr/td[2]`` in the response, or ``''`` when
        ``log_links`` has no entry for the selected log name.
    """
    # Only 'stderr' is honoured as-is; everything else falls back to stdout.
    if log_name != 'stderr':
        log_name = 'stdout'

    # Nothing to fetch for this log name.
    if not log_links or log_name not in log_links:
        return ''

    query = {'doAs': username}
    if offset != 0:
        query['start'] = offset

    log_link = log_links[log_name]
    client = get_log_client(log_link)
    # Index 2 of the split URL is the path component.
    link_path = lib_urlsplit(log_link)[2]
    root = Resource(client, link_path, urlencode=False)
    response = root.get('', params=query)

    document = html.fromstring(response, parser=html.HTMLParser())
    cells = document.xpath('/html/body/table/tbody/tr/td[2]')
    return cells[0].text_content()
def urlsplit(url):
    """
    Split an HDFS path (hdfs://nn:port/foo) or a bare path (/foo) into the
    same 5-tuple shape the standard urlsplit produces.

    A value without '://' is treated entirely as an HDFS path. Schemes other
    than 'hdfs' and 'viewfs' are handed to the standard urlsplit unchanged.
    Query and fragment are always empty for the hdfs/viewfs cases.
    """
    schema, separator, remainder = url.partition('://')
    if not separator:
        # No scheme marker: the whole argument is an HDFS path.
        return ('hdfs', '', normpath(url), '', '')

    if schema not in ('hdfs', 'viewfs'):
        # Non-HDFS schemes use the standard splitter.
        return lib_urlsplit(url)

    netloc, slash, tail = remainder.partition('/')
    if not slash:
        # Only a netloc was given; the path defaults to the root.
        return (schema, remainder, '/', '', '')

    return (schema, netloc, normpath(slash + tail), '', '')
def _get_base_url(self):
    """Return ``self._remote_url`` reduced to its scheme and netloc.

    The path, query and fragment components are replaced with empty strings
    before the URL is reassembled.
    """
    parts = lib_urlsplit(self._remote_url)
    scheme, netloc = parts.scheme, parts.netloc
    return lib_urlunsplit((scheme, netloc, '', '', ''))