def load_sampler_limits(process_options, stanza_name, mlspl_conf):
    """Read sampling limits from conf file and decide sample count.

    Args:
        process_options (dict): process options
        stanza_name (str): algo/scorer stanza name in mlspl.conf
        mlspl_conf (obj): the conf utility for mlspl conf settings

    Returns:
        sampler_limits (dict): sampler limits
    """
    max_inputs = int(mlspl_conf.get_mlspl_prop('max_inputs', stanza_name, -1))
    sampler_limits = {
        'use_sampling': is_truthy(str(mlspl_conf.get_mlspl_prop('use_sampling', stanza_name, 'yes'))),
        'sample_seed': process_options['sample_seed'],  # simply pass through the sample seed
    }

    # Choose the sample count: honor an explicit sample_count from the
    # search, but never exceed max_inputs from mlspl.conf.
    if process_options['sample_count']:
        sampler_limits['sample_count'] = min(process_options['sample_count'], max_inputs)
    else:
        sampler_limits['sample_count'] = max_inputs

    return sampler_limits
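# A minimal usage sketch (hypothetical values; mlspl_conf is assumed to be a
# conf utility exposing get_mlspl_prop, per the signature above):
#
#   limits = load_sampler_limits(
#       {'sample_count': 50000, 'sample_seed': 42},
#       'DecisionTreeClassifier',
#       mlspl_conf,
#   )
#   # => {'use_sampling': True, 'sample_seed': 42, 'sample_count': 50000}
#   #    (sample_count is capped at the stanza's max_inputs)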
def check_probabilities(options):
    """Pop the 'probabilities' param from options and return it as a bool.

    Args:
        options (dict): process options

    Returns:
        probabilities (bool): whether probabilities output was requested
    """
    out_params = convert_params(options.get('params', {}), bools=['probabilities'], ignore_extra=True)

    if 'probabilities' in out_params:
        probabilities = is_truthy(out_params['probabilities'])
        del options['params']['probabilities']
    else:
        probabilities = False

    return probabilities
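# A behavior sketch (hypothetical options dict): the param is consumed from
# options['params'] so it is not passed through to the algorithm.
#
#   opts = {'params': {'probabilities': 't', 'max_depth': '5'}}
#   check_probabilities(opts)   # => True
#   opts['params']              # => {'max_depth': '5'}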
def check_sampler(sampler_limits, algo_name):
    """Inform the user that sampling is on, or raise an error if sampling is
    off and the event count exceeds the limit.

    Args:
        sampler_limits (dict): sampler limits
        algo_name (str): algo name
    """
    if is_truthy(sampler_limits['use_sampling']):
        messages.warn(
            'Input event count exceeds max_inputs for %s (%d), model will be fit on a sample of events.'
            % (algo_name, sampler_limits['sample_count']))
    else:
        raise RuntimeError(
            'Input event count exceeds max_inputs for %s (%d) and sampling is disabled.'
            % (algo_name, sampler_limits['sample_count']))
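# A behavior sketch (hypothetical limits dict): with use_sampling truthy this
# only emits a warning; otherwise it raises.
#
#   check_sampler({'use_sampling': 'no', 'sample_count': 100000}, 'KMeans')
#   # => RuntimeError: Input event count exceeds max_inputs for KMeans (100000) ...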
def load_resource_limits(algo_name, process_options):
    """Load algorithm-specific resource limits.

    Args:
        algo_name (str): algorithm name
        process_options (dict): process options, including the mlspl limits
            from the conf files under 'mlspl_limits'

    Returns:
        resource_limits (dict): dictionary of resource limits
    """
    limits = process_options['mlspl_limits']
    resource_limits = {
        'max_memory_usage_mb': int(limits.get('max_memory_usage_mb', -1)),
        'streaming_apply': is_truthy(limits.get('streaming_apply', False)),
    }
    return resource_limits
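# A minimal sketch (hypothetical process_options; keys mirror mlspl.conf settings):
#
#   limits = load_resource_limits('DecisionTreeClassifier', {
#       'mlspl_limits': {'max_memory_usage_mb': '1024', 'streaming_apply': 'f'},
#   })
#   # => {'max_memory_usage_mb': 1024, 'streaming_apply': False}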
def tree_summary(algo, options=None):
    """Create a summary for tree-based models.

    Args:
        algo (object): an algo object
        options (dict): options

    Returns:
        (dataframe): dataframe representation of the tree summary
    """
    return_json = None
    depth_limit = None

    if options:
        out_params = convert_params(options.get('params', {}), ints=['limit'], bools=['json'])
        if 'json' in out_params:
            return_json = out_params['json']
        if 'limit' in out_params:
            depth_limit = out_params['limit']

    # Fall back to mlspl.conf defaults for anything not passed in options.
    if return_json is None:
        return_json = is_truthy(conf.get_mlspl_prop('summary_return_json', algo.__class__.__name__, 'f'))
    if depth_limit is None:
        depth_limit = int(conf.get_mlspl_prop('summary_depth_limit', algo.__class__.__name__, -1))

    if depth_limit <= 0:
        raise ValueError('Limit = %d. Value for limit should be greater than 0.' % depth_limit)

    root = 0
    depth = 0
    if return_json:
        output_data = [json.dumps(tree_summary_dict(algo, depth_limit, root, depth), sort_keys=True)]
    else:
        output_data = tree_summary_str(algo, depth_limit, root, depth)
    return pd.DataFrame({'Decision Tree Summary': output_data})
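# A usage sketch (hypothetical fitted algo; 'limit' and 'json' mirror the
# params accepted above):
#
#   summary_df = tree_summary(algo, {'params': {'limit': '3', 'json': 'f'}})
#   # => a one-column DataFrame ('Decision Tree Summary') with the tree
#   #    rendered as text down to depth 3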
def load_sampler_limits(process_options, algo_name):
    """Read sampling limits from conf file and decide sample count.

    Args:
        process_options (dict): process options
        algo_name (str): algo name

    Returns:
        sampler_limits (dict): sampler limits
    """
    sampler_limits = {}
    sampler_limits['use_sampling'] = is_truthy(str(conf.get_mlspl_prop('use_sampling', algo_name, 'yes')))

    # Choose the sample count: honor an explicit sample_count from the
    # search, but never exceed max_inputs from mlspl.conf.
    max_inputs = int(conf.get_mlspl_prop('max_inputs', algo_name, -1))
    if process_options['sample_count']:
        sampler_limits['sample_count'] = min(process_options['sample_count'], max_inputs)
    else:
        sampler_limits['sample_count'] = max_inputs

    # simply pass through the sample seed
    sampler_limits['sample_seed'] = process_options['sample_seed']

    return sampler_limits
def setup(self):
    """Parse search string, choose processor, initialize controller.

    Returns:
        (dict): getinfo response (command type) and required fields. This
            response is sent back to the CEXC process on the getinfo
            exchange (first chunk) to establish our execution type and
            required fields.
    """
    self.controller_options = self.handle_arguments(self.getinfo)
    self.controller = ChunkedController(self.getinfo, self.controller_options)

    self.watchdog = command_util.get_watchdog(
        time_limit=-1,
        memory_limit=self.controller.resource_limits['max_memory_usage_mb'],
    )

    streaming_apply = is_truthy(conf.get_mlspl_prop('streaming_apply', default='f'))
    exec_type = 'streaming' if streaming_apply else 'stateful'

    required_fields = self.controller.get_required_fields()
    return {'type': exec_type, 'required_fields': required_fields}
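# The getinfo response is a plain dict; an illustrative shape (hypothetical
# field names):
#
#   {'type': 'stateful', 'required_fields': ['feature_a', 'feature_b']}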
        # Our final farewell
        self.log_performance_timers()
        return ({'finished': finished_flag}, output_body)

    def log_performance_timers(self):
        logger.debug(
            "command=fit, read_time=%f, handle_time=%f, write_time=%f, csv_parse_time=%f, csv_render_time=%f"
            % (self._read_time, self._handle_time, self._write_time,
               self.controller._csv_parse_time, self.controller._csv_render_time))


if __name__ == "__main__":
    logger.debug("Starting fit.py.")

    do_profile = is_truthy(conf.get_mlspl_prop('profile', 'default', 'n'))
    if do_profile:
        import cProfile
        import pstats

        pr = cProfile.Profile()
        pr.enable()

    FitCommand(handler_data=BaseChunkHandler.DATA_RAW).run()

    if do_profile:
        pr.disable()
        s = StringIO()
        ps = pstats.Stats(pr, stream=s).sort_stats('cumulative')
        ps.print_stats(10)
        # Log the collected stats; without this the StringIO buffer is discarded.
        logger.debug(s.getvalue())
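# Profiling is toggled in mlspl.conf; a sketch of the stanza (assuming the
# [default] stanza, per the lookup above):
#
#   [default]
#   profile = y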