def _is_inference_valid(sample):
    """
    Check whether the inference data is empty or has the same length as the labels.

    If the probs have a different length from the labels, it can be confusing when
    assigning each prob to a label. `_is_inference_valid` returns True only when the
    data sizes match each other. Note that prob data could be empty, so an empty
    prob will pass the check.
    """
    ground_truth_len = len(sample['ground_truth_label'])
    for name in ['ground_truth_prob', 'ground_truth_prob_sd',
                 'ground_truth_prob_itl95_low', 'ground_truth_prob_itl95_hi']:
        if sample[name] and len(sample[name]) != ground_truth_len:
            logger.info(
                'Length of %s does not match ground_truth_label. Length of ground_truth_label: %d, '
                'length of %s: %d.', name, ground_truth_len, name, len(sample[name]))
            return False

    predicted_len = len(sample['predicted_label'])
    for name in ['predicted_prob', 'predicted_prob_sd',
                 'predicted_prob_itl95_low', 'predicted_prob_itl95_hi']:
        if sample[name] and len(sample[name]) != predicted_len:
            logger.info(
                'Length of %s does not match predicted_label. Length of predicted_label: %d, '
                'length of %s: %d.', name, predicted_len, name, len(sample[name]))
            return False
    return True
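# A minimal illustration of the check above, assuming a hypothetical sample dict
# with the label/prob keys the function reads. The values below are made up for
# demonstration only and are not part of the module.
#
#   sample = {
#       'ground_truth_label': [1, 3],
#       'ground_truth_prob': [0.9, 0.8],            # same length as labels -> passes
#       'ground_truth_prob_sd': [],                 # empty probs pass the check
#       'ground_truth_prob_itl95_low': [0.7, 0.6],
#       'ground_truth_prob_itl95_hi': [0.95, 0.9],
#       'predicted_label': [1],
#       'predicted_prob': [0.9, 0.1],               # length 2 != 1 -> returns False
#       'predicted_prob_sd': [],
#       'predicted_prob_itl95_low': [],
#       'predicted_prob_itl95_hi': [],
#   }
#   _is_inference_valid(sample)  # False: predicted_prob length mismatch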
def load(self):
    """Start loading data from the latest summary file to the loader."""
    self.status = _LoaderStatus.LOADING.value
    filenames = []
    for filename in FileHandler.list_dir(self._loader_info['summary_dir']):
        if FileHandler.is_file(FileHandler.join(self._loader_info['summary_dir'], filename)):
            filenames.append(filename)
    filenames = ExplainLoader._filter_files(filenames)

    if not filenames:
        raise TrainJobNotExistError(
            'No summary file found in %s, explain job will be deleted.' % self._loader_info['summary_dir'])

    is_end = False
    while not is_end and self.status != _LoaderStatus.STOP.value:
        try:
            file_changed, is_end, event_dict = self._parser.list_events(filenames)
        except UnknownError:
            break

        if file_changed:
            logger.info('Summary file in %s updated, reloading the data in the summary.',
                        self._loader_info['summary_dir'])
            self._clear_job()
        if event_dict:
            self._import_data_from_event(event_dict)
def load(self):
    """Start loading data from the latest summary file to the loader."""
    filenames = []
    for filename in FileHandler.list_dir(self._loader_info['summary_dir']):
        if FileHandler.is_file(FileHandler.join(self._loader_info['summary_dir'], filename)):
            filenames.append(filename)
    filenames = ExplainLoader._filter_files(filenames)

    if not filenames:
        raise TrainJobNotExistError(
            'No summary file found in %s, explain job will be deleted.' % self._loader_info['summary_dir'])

    is_end = False
    while not is_end:
        is_clean, is_end, event_dict = self._parser.parse_explain(filenames)

        if is_clean:
            logger.info('Summary file in %s updated, reloading the data in the summary.',
                        self._loader_info['summary_dir'])
            self._clear_job()
        if event_dict:
            self._import_data_from_event(event_dict)
def _repeat_loading(self, repeat_interval):
    """Periodically load the summary."""
    while True:
        try:
            logger.info('Start to load data, repeat interval: %r.', repeat_interval)
            self._load_data()
            if not repeat_interval:
                return
            time.sleep(repeat_interval)
        except UnknownError as ex:
            logger.error(
                'Unexpected error happened when loading data. Loading status: %s, loading pool size: %d. '
                'Detail: %s', self._loading_status, len(self._loader_pool), str(ex))
def _stop_load_data(self):
    """Stop loading data; the status changes to STOPPING."""
    if self.status != _ExplainManagerStatus.LOADING.value:
        return
    logger.info('Start to stop loading data, set status to %s.', _ExplainManagerStatus.STOPPING.value)
    self.status = _ExplainManagerStatus.STOPPING.value

    for loader in self._loader_pool.values():
        loader.stop()

    # Busy-wait until the loading thread marks the manager as DONE.
    while self.status != _ExplainManagerStatus.DONE.value:
        continue
    logger.info('Stopped loading data.')
def _import_data_from_event(self, event_dict: Dict):
    """Parse and import data from the event data."""
    if 'metadata' not in event_dict and self._is_metadata_empty():
        raise ParamValueError('Metadata is incomplete; metadata should be written to the summary first.')

    for tag, event in event_dict.items():
        if tag == ExplainFieldsEnum.METADATA.value:
            self._import_metadata_from_event(event.metadata)
        elif tag == ExplainFieldsEnum.BENCHMARK.value:
            self._import_benchmark_from_event(event.benchmark)
        elif tag == ExplainFieldsEnum.SAMPLE_ID.value:
            self._import_sample_from_event(event)
        else:
            logger.info('Unknown ExplainField: %s.', tag)
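# Illustrative sketch of the dispatch above. The event objects are hypothetical
# stand-ins carrying only the attributes the importers read; the actual tag
# strings are whatever ExplainFieldsEnum defines.
#
#   event_dict = {
#       ExplainFieldsEnum.METADATA.value: metadata_event,    # has .metadata
#       ExplainFieldsEnum.BENCHMARK.value: benchmark_event,  # has .benchmark
#       ExplainFieldsEnum.SAMPLE_ID.value: sample_event,     # consumed whole
#   }
#   self._import_data_from_event(event_dict)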
def _execute_loading(self):
    """Execute the data loading."""
    # Load the newest loaders first.
    for loader_id in list(self._loader_pool.keys())[::-1]:
        with self._loader_pool_mutex:
            loader = self._loader_pool.get(loader_id, None)
            if loader is None:
                logger.debug('Loader %r has been deleted, will not load data.', loader_id)
                continue

        if self.status == _ExplainManagerStatus.STOPPING.value:
            logger.info('Loader %s status is %s, will return.', loader_id, loader.status)
            return

        loader.load()
def _repeat_loading(self, repeat_interval):
    """Periodically load the summary."""
    # Allocate CPU resources to enable gunicorn to start the web service.
    time.sleep(1)
    while True:
        try:
            if self.status == _ExplainManagerStatus.STOPPING.value:
                logger.debug('Current loading status is %s, repeat loading will not be triggered.',
                             _ExplainManagerStatus.STOPPING.value)
            else:
                logger.info('Start triggering repeat loading, repeat interval: %r.', repeat_interval)
                self._load_data()
                if not repeat_interval:
                    return
            time.sleep(repeat_interval)
        except UnknownError as ex:
            logger.error(
                'Unexpected error happened when loading data. Loading status: %s, loading pool size: %d. '
                'Detail: %s', self.status, len(self._loader_pool), str(ex))
def _load_data(self):
    """
    Prepare loaders in the cache and start loading data from the summaries.

    Only a limited number of loaders are cached, ordered by updated_time or
    query_time. The size of the cache pool is determined by _MAX_LOADERS_NUM.
    When the manager starts loading data, only the latest _MAX_LOADERS_NUM
    summaries are loaded into the cache. If a cached loader is queried by
    'get_job', the query_time of the loader is updated and the loader is moved
    to the end of the cache. If an uncached summary is queried, a new loader
    instance is generated and put at the end of the cache.
    """
    try:
        with self._status_mutex:
            if self._loading_status == _ExplainManagerStatus.LOADING.value:
                logger.info('Current status is %s, data loading will be ignored.', self._loading_status)
                return

            self._loading_status = _ExplainManagerStatus.LOADING.value
            self._cache_loaders()
            self._execute_loading()

            if not self._loader_pool:
                self._loading_status = _ExplainManagerStatus.INVALID.value
            else:
                self._loading_status = _ExplainManagerStatus.DONE.value

            logger.info('Load event data end, status: %s, and loader pool size: %d.',
                        self._loading_status, len(self._loader_pool))
    except Exception as ex:
        self._loading_status = _ExplainManagerStatus.INVALID.value
        logger.exception(ex)
        raise UnknownError(str(ex))
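# A minimal sketch of the move-to-end cache policy described in the docstring
# above, using collections.OrderedDict. This is not the actual _cache_loaders
# implementation; MAX_LOADERS_NUM and the loader values are hypothetical.
#
#   from collections import OrderedDict
#
#   MAX_LOADERS_NUM = 3
#   cache = OrderedDict()
#
#   def touch(loader_id, loader):
#       """Query or insert a loader; recently used loaders sit at the end."""
#       if loader_id in cache:
#           cache.move_to_end(loader_id)   # queried loader moves to the end
#       else:
#           cache[loader_id] = loader      # new loader appended at the end
#           if len(cache) > MAX_LOADERS_NUM:
#               cache.popitem(last=False)  # evict the least recently used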
def parse_explain(self, filenames):
    """
    Load the summary file and parse the file content.

    Args:
        filenames (list[str]): File name list.

    Returns:
        tuple, (is_clean, is_end, event_data). `is_clean` is True if the latest
        summary file changed and stale job data should be cleared; `is_end` is
        True if the summary file has finished loading; `event_data` maps parsed
        fields to tensor values.
    """
    summary_files = self.sort_files(filenames)

    is_end = False
    is_clean = False
    event_data = {}
    filename = summary_files[-1]

    file_path = FileHandler.join(self._summary_dir, filename)
    if filename != self._latest_filename:
        self._summary_file_handler = FileHandler(file_path, 'rb')
        self._latest_filename = filename
        self._latest_file_size = 0
        is_clean = True

    new_size = FileHandler.file_stat(file_path).size
    if new_size == self._latest_file_size:
        is_end = True
        return is_clean, is_end, event_data

    while True:
        start_offset = self._summary_file_handler.offset
        try:
            event_str = self.event_load(self._summary_file_handler)
            if event_str is None:
                self._summary_file_handler.reset_offset(start_offset)
                is_end = True
                return is_clean, is_end, event_data
            if len(event_str) > MAX_EVENT_STRING:
                logger.warning("file_path: %s, event string length %d exceeds %d, dropping it.",
                               self._summary_file_handler.file_path, len(event_str), MAX_EVENT_STRING)
                continue

            field_list, tensor_value_list = self._event_decode(event_str)
            for field, tensor_value in zip(field_list, tensor_value_list):
                event_data[field] = tensor_value

            logger.info("Parse summary file offset %d, file path: %s.",
                        self._summary_file_handler.offset, file_path)
            return is_clean, is_end, event_data
        except (exceptions.CRCFailedError, exceptions.CRCLengthFailedError) as ex:
            self._summary_file_handler.reset_offset(start_offset)
            is_end = True
            logger.warning("CRC check failed, ignoring this file, file_path=%s, offset=%s. Detail: %r.",
                           self._summary_file_handler.file_path, self._summary_file_handler.offset, str(ex))
            return is_clean, is_end, event_data
        except (OSError, DecodeError, exceptions.MindInsightException) as ex:
            is_end = True
            logger.warning("Failed to parse log file, ignoring this file, detail: %r, "
                           "file path: %s.", str(ex), self._summary_file_handler.file_path)
            return is_clean, is_end, event_data
        except Exception as ex:
            logger.exception(ex)
            raise UnknownError(str(ex))
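# Hedged usage sketch of the (is_clean, is_end, event_data) contract above,
# mirroring how ExplainLoader.load consumes it. 'parser', 'filenames',
# 'clear_job', and 'import_data_from_event' are placeholders for the parser
# instance, its summary file list, and the loader callbacks.
#
#   is_end = False
#   while not is_end:
#       is_clean, is_end, event_dict = parser.parse_explain(filenames)
#       if is_clean:      # the latest file changed: discard stale job data
#           clear_job()
#       if event_dict:    # one decoded event per call, keyed by field
#           import_data_from_event(event_dict)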