def handle_arguments(getinfo):
    """Check for invalid arguments and get controller_options.

    Args:
        getinfo (dict): getinfo metadata

    Returns:
        controller_options (dict): controller options
    """
    options = parse_args(getinfo['searchinfo']['args'])
    params = options.get('params', {})
    collection_name = params.get('collection_name')
    experiment_id = params.get('experiment_id')

    if collection_name is None and experiment_id is None:
        raise RuntimeError(
            'You must provide a KVStore collection name (collection_name=...) '
            'or an Experiment id (experiment_id=...)'
        )

    controller_options = parse_args(getinfo['searchinfo']['raw_args'][1:])
    controller_options['processor'] = 'KVStoreLookupProcessor'
    controller_options['collection_name'] = collection_name
    controller_options['experiment_id'] = experiment_id
    return controller_options
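# The handle_arguments variants in this file all consume the same parse_args
# contract. A minimal sketch of that contract, assumed from the accesses made
# here (options['args'] for positional tokens, options['params'] for key=value
# tokens): the real helper in this package also handles quoting and clauses
# such as split_by, so this toy version is illustrative only.
def toy_parse_args(argv):
    options = {'args': [], 'params': {}}
    for token in argv:
        if '=' in token:
            key, value = token.split('=', 1)
            options['params'][key] = value
        else:
            options['args'].append(token)
    return options

# toy_parse_args(['collection_name=my_kv', 'experiment_id=42'])
# -> {'args': [], 'params': {'collection_name': 'my_kv', 'experiment_id': '42'}}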
def handle_arguments(getinfo):
    """Check for invalid arguments and get controller_options.

    Args:
        getinfo (dict): getinfo metadata

    Returns:
        controller_options (dict): controller options
    """
    if len(getinfo['searchinfo']['args']) == 0:
        raise RuntimeError('First argument must be a KVStore collection name')

    controller_options = param_util.parse_args(getinfo['searchinfo']['raw_args'][1:])
    controller_options['namespace'], controller_options['collection_name'] = \
        param_util.parse_namespace_model_name(getinfo['searchinfo']['args'][0])
    controller_options['processor'] = 'KVStoreLookupProcessor'
    return controller_options
def handle_arguments(getinfo):
    """Take the getinfo metadata and return controller_options.

    Args:
        getinfo (dict): getinfo metadata from first chunk

    Returns:
        controller_options (dict): options to be passed to controller
    """
    if len(getinfo['searchinfo']['raw_args']) == 0:
        raise RuntimeError('First argument must be a scoring method')

    raw_options = parse_args(getinfo['searchinfo']['raw_args'][1:])
    controller_options = ScoreCommand.handle_raw_options(raw_options)
    controller_options['scoring_name'] = getinfo['searchinfo']['args'][0]
    return controller_options
def handle_arguments(getinfo):
    """Check for invalid arguments and return controller options.

    Args:
        getinfo (dict): getinfo metadata

    Returns:
        controller_options (dict): controller options
    """
    if len(getinfo['searchinfo']['args']) == 0:
        raise RuntimeError('First argument must be a saved model')

    controller_options = param_util.parse_args(getinfo['searchinfo']['raw_args'][1:])
    controller_options['namespace'], controller_options['model_name'] = \
        param_util.parse_namespace_model_name(getinfo['searchinfo']['args'][0])
    controller_options['processor'] = 'SummaryProcessor'
    return controller_options
def handle_arguments(getinfo):
    """Take the getinfo metadata and return controller_options.

    Args:
        getinfo (dict): getinfo metadata from first chunk

    Returns:
        controller_options (dict): options to be passed to controller
        partial_fit (bool): boolean flag to indicate partial fit
    """
    if len(getinfo['searchinfo']['raw_args']) == 0:
        raise RuntimeError('First argument must be an "algorithm"')

    raw_options = parse_args(getinfo['searchinfo']['raw_args'][1:])
    controller_options, partial_fit = FitCommand.handle_raw_options(raw_options)
    controller_options['algo_name'] = getinfo['searchinfo']['args'][0]
    return controller_options, partial_fit
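# Illustrative only: the assumed shape of the getinfo metadata consumed by the
# fit variant above, for an SPL invocation along the lines of
# `| fit LinearRegression y from x into my_model`. The field names mirror the
# accesses made by handle_arguments; the concrete values are hypothetical.
example_getinfo = {
    'searchinfo': {
        'args': ['LinearRegression', 'y', 'from', 'x', 'into', 'my_model'],
        'raw_args': ['LinearRegression', 'y', 'from', 'x', 'into', 'my_model'],
    }
}
# handle_arguments(example_getinfo) would set
# controller_options['algo_name'] = 'LinearRegression' and return the
# (controller_options, partial_fit) pair from FitCommand.handle_raw_options.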
def handle_arguments(getinfo):
    """Take the getinfo metadata and return controller_options.

    Args:
        getinfo (dict): getinfo metadata

    Returns:
        controller_options (dict): options to be sent to controller
    """
    if len(getinfo['searchinfo']['args']) == 0:
        raise RuntimeError('First argument must be a saved model.')

    raw_options = parse_args(getinfo['searchinfo']['raw_args'][1:])
    controller_options = ApplyCommand.handle_raw_options(raw_options)
    controller_options['namespace'], controller_options['model_name'] = \
        parse_namespace_model_name(getinfo['searchinfo']['args'][0])
    return controller_options
def handle_arguments(getinfo):
    """Take the getinfo metadata and return controller_options.

    Args:
        getinfo (dict): getinfo metadata

    Returns:
        controller_options (dict): options to be sent to controller
    """
    if len(getinfo['searchinfo']['args']) == 0:
        raise RuntimeError('First argument must be a saved model.')

    raw_options = parse_args(getinfo['searchinfo']['raw_args'][1:])
    controller_options = ApplyCommand.handle_raw_options(raw_options)
    controller_options['namespace'], controller_options['model_name'] = \
        parse_namespace_model_name(getinfo['searchinfo']['args'][0])

    searchinfo = getinfo['searchinfo']
    getinfo['searchinfo'] = add_distributed_search_info(process_options=None,
                                                        searchinfo=searchinfo)
    controller_options['mlspl_conf'] = MLSPLConf(getinfo['searchinfo'])
    return controller_options
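# A plausible sketch of the parse_namespace_model_name helper used by several
# handlers above, assumed from its call sites: a model reference may carry a
# namespace qualifier, e.g. 'app:my_model'. The real helper lives in
# param_util; both its splitting rule and the default namespace shown here are
# assumptions.
def toy_parse_namespace_model_name(name):
    if ':' in name:
        namespace, model_name = name.split(':', 1)
        return namespace, model_name
    return 'user', name  # assumed default namespace

# toy_parse_namespace_model_name('app:my_model') -> ('app', 'my_model')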
import csv
import math
import random
import StringIO  # Python 2 module: this command reads/writes str chunks
import sys

# parse_args, convert_params, log_and_die, log_and_warn, read_chunk,
# write_chunk, CHUNK_SIZE, and BY_VALUE_MAX are package-local helpers and
# constants defined elsewhere in this module's imports.


def main():
    out_metadata = {}
    random_seed = None
    sample_mode = None  # ratio, count, proportional, partition
    sample_ratio = None
    sample_count = None
    sample_proportional_field = None
    sample_inverse = False
    sample_partitions = None
    sample_partition_fieldname = None
    sample_split_by_field = None

    # Phase 0: getinfo exchange
    metadata, body = read_chunk(sys.stdin)

    # parse args
    options = parse_args(metadata['searchinfo']['args'])

    if 'params' in options:
        try:
            params = convert_params(options.get('params', {}),
                                    ints=['seed', 'count', 'partitions'],
                                    floats=['ratio'],
                                    strs=['fieldname', 'proportional'],
                                    aliases={})
        except RuntimeError as e:
            log_and_die(out_metadata, str(e))

        if 'seed' in params:
            random_seed = params['seed']
            if random_seed < 0:
                log_and_die(out_metadata, "Random seed must not be negative.")
            else:
                random.seed(random_seed)
        else:
            random.seed()

        if 'ratio' in params:
            sample_mode = "ratio"
            sample_ratio = params['ratio']
            if sample_ratio < 0 or sample_ratio > 1:
                log_and_die(
                    out_metadata,
                    "Sampling ratio must be a valid probability (i.e., in the interval [0,1]).")

        if 'count' in params:
            if sample_mode is not None:
                log_and_die(out_metadata, "More than one sampling mode specified.")
            else:
                sample_mode = "count"
                sample_count = params['count']
                if sample_count < 1:
                    log_and_die(out_metadata, "Sample count must be one or greater.")

        if 'proportional' in params:
            if sample_mode is not None:
                log_and_die(out_metadata, "More than one sampling mode specified.")
            else:
                sample_mode = "proportional"
                sample_proportional_field = params['proportional']

        if 'partitions' in params:
            if sample_mode is not None:
                log_and_die(out_metadata, "More than one sampling mode specified.")
            else:
                sample_mode = "partition"
                sample_partitions = params['partitions']
                if sample_partitions < 2:
                    log_and_die(out_metadata, "Must specify two or more partitions.")

        if sample_mode == "partition":
            if 'fieldname' in params:
                sample_partition_fieldname = params['fieldname']
            else:
                sample_partition_fieldname = 'partition_number'

        if 'fieldname' in params and sample_mode != "partition":
            log_and_die(out_metadata,
                        "Only partition mode supports the fieldname parameter.")

    if 'args' in options:
        args = options['args']
        for arg in args:
            try:
                sample_arg = float(arg)
                if sample_arg > 0 and sample_arg < 1:
                    if sample_mode is not None:
                        log_and_die(out_metadata, "More than one sampling mode specified.")
                    sample_ratio = sample_arg
                    sample_mode = 'ratio'
                elif sample_arg >= 1 and sample_arg.is_integer():
                    if sample_mode is not None:
                        log_and_die(out_metadata, "More than one sampling mode specified.")
                    sample_count = int(sample_arg)  # validated as integral above
                    sample_mode = 'count'
                else:
                    log_and_die(
                        out_metadata,
                        "Must specify either a number between 0 and 1, non-inclusive "
                        "(sampling probability) or an integer greater than or equal "
                        "to 1 (count of events).")
            except ValueError:
                if arg != 'inverse' and arg != 'fieldname':
                    log_and_die(out_metadata, "Unrecognized argument: %s" % arg)

        if 'inverse' in args:
            if sample_mode == 'proportional':
                sample_inverse = True
            else:
                log_and_die(out_metadata,
                            "Only proportional mode supports the inverse parameter.")

    if 'split_by' in options:
        if sample_mode == 'count':
            try:
                sample_split_by_field = options['split_by']
                if sample_split_by_field == "":
                    log_and_die(out_metadata, "Split-by field name is an empty string.")
            except ValueError:
                log_and_die(out_metadata, "Failed to parse split-by clause.")
        else:
            log_and_die(out_metadata, "A by clause can only be used in count mode.")

    if sample_mode == 'count':
        capdata = {'type': 'events'}
    else:
        capdata = {'type': 'stateful'}
    write_chunk(sys.stdout, capdata, '')

    # need to buffer events for all modes because we need all the field names
    if sample_split_by_field is None:
        event_reservoir = []
    else:
        event_reservoir = {}
        per_by_value_index = {}

    global_index = 0
    missing_split_by_field = 0
    # field names accumulate across chunks: buffered count-mode records are
    # only written out in Phase 2
    field_names = set()

    # Phase 1: sample the events as they come in
    while True:
        ret = read_chunk(sys.stdin)
        if not ret:
            break
        metadata, body = ret

        out_metadata = {}
        out_metadata['finished'] = False

        outbuf = StringIO.StringIO()
        last_index = 0

        reader = csv.DictReader(body.splitlines(), dialect='excel')
        for index, record in enumerate(reader):
            # RATIO MODE
            if sample_mode == "ratio":
                if random.random() <= sample_ratio:
                    event_reservoir.append(record)
            # COUNT MODE
            # Uses reservoir sampling:
            # https://en.wikipedia.org/wiki/Reservoir_sampling#Example_implementation
            elif sample_mode == "count":
                gindex = index + global_index
                if sample_split_by_field is not None:
                    if sample_split_by_field in record and record[sample_split_by_field] != "":
                        by_value = record[sample_split_by_field]
                    else:
                        by_value = 'NULL_BY_VALUE'
                        missing_split_by_field += 1
                    eres = event_reservoir.setdefault(by_value, [])
                    gindex = per_by_value_index.get(by_value, 0)
                    if len(per_by_value_index) > BY_VALUE_MAX:
                        log_and_die(out_metadata,
                                    "Too many values (> %d) for split-by field %s." %
                                    (BY_VALUE_MAX, sample_split_by_field))
                    per_by_value_index[by_value] = gindex + 1
                else:
                    eres = event_reservoir
                if gindex < sample_count:
                    # reservoir not yet full: keep unconditionally
                    eres.append({'gindex': gindex, 'record': record})
                else:
                    # replace a random slot with probability sample_count / (gindex + 1)
                    r = random.randint(0, gindex)
                    if r < sample_count:
                        eres[r] = {'gindex': gindex, 'record': record}
            # PROPORTIONAL MODE
            elif sample_mode == "proportional":
                if sample_proportional_field not in record:
                    log_and_die(out_metadata,
                                "The specified field for proportional sampling does not exist: %s" %
                                sample_proportional_field)
                try:
                    sample_proportional_val = float(record[sample_proportional_field])
                except ValueError:
                    log_and_die(out_metadata,
                                "The specified field for proportional sampling (%s) contains a non-numeric value: %s." %
                                (sample_proportional_field, record[sample_proportional_field]))
                if sample_proportional_val < 0 or sample_proportional_val > 1:
                    log_and_die(out_metadata,
                                "The field to use for proportional sampling must be a valid probability (i.e., between 0 and 1). Received %f." %
                                sample_proportional_val)
                if sample_inverse:
                    sample_proportional_val = 1 - sample_proportional_val
                if random.random() <= sample_proportional_val:
                    event_reservoir.append(record)
            # PARTITION MODE
            elif sample_mode == "partition":
                p = random.randint(0, sample_partitions - 1)
                if sample_partition_fieldname is None:
                    sample_partition_fieldname = 'partition_number'
                if sample_partition_fieldname in record:
                    log_and_die(out_metadata,
                                "The specified field name for the partition already exists: %s" %
                                sample_partition_fieldname)
                else:
                    record[sample_partition_fieldname] = p
                event_reservoir.append(record)
            else:
                log_and_die(out_metadata, "Invalid sampling mode specified: %s" % sample_mode)

            last_index = index
            # we do this at the end so any added fields are included
            field_names = field_names.union(set(record.keys()))

        # finished reading the chunk; do any per-chunk actions
        global_index = global_index + last_index + 1

        if sample_mode != 'count':
            writer = csv.DictWriter(outbuf, fieldnames=list(field_names),
                                    dialect='excel', extrasaction='ignore')
            writer.writeheader()
            for event in event_reservoir:
                writer.writerow(event)
            write_chunk(sys.stdout, out_metadata, outbuf.getvalue())
            event_reservoir = []
            field_names = set()
        else:
            write_chunk(sys.stdout, {"finished": False}, '')

        if metadata.get('finished', False):
            break

    # Phase 2: output (count mode) and wrap-up
    if sample_mode == 'count':
        if sample_split_by_field is not None:
            merged_event_reservoir = []
            for by_value in event_reservoir:
                merged_event_reservoir.extend(event_reservoir[by_value])
            event_reservoir = sorted(merged_event_reservoir,
                                     key=lambda val: val['gindex'])
        else:
            event_reservoir = sorted(event_reservoir, key=lambda val: val['gindex'])

        # loop over CHUNK_SIZE slices of event_reservoir
        num_chunks = int(math.ceil(len(event_reservoir) / float(CHUNK_SIZE)))
        for i_chunk in range(num_chunks):
            if not read_chunk(sys.stdin):
                break
            outbuf = StringIO.StringIO()
            writer = csv.DictWriter(outbuf, fieldnames=list(field_names),
                                    dialect='excel', extrasaction='ignore')
            writer.writeheader()
            for val in event_reservoir[i_chunk * CHUNK_SIZE:i_chunk * CHUNK_SIZE + CHUNK_SIZE]:
                writer.writerow(val['record'])
            write_chunk(sys.stdout, {"finished": False}, outbuf.getvalue())

    # we're done, so send final response to finish the session
    ret = read_chunk(sys.stdin)
    if ret:
        out_metadata = {}
        out_metadata['finished'] = True
        if missing_split_by_field > 0:
            log_and_warn(out_metadata,
                         "%d events (out of %d) were missing the %s field and were "
                         "sampled as though they all had the same value." %
                         (missing_split_by_field, global_index, sample_split_by_field))
        write_chunk(sys.stdout, out_metadata, '')
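# A minimal sketch of the read_chunk/write_chunk helpers main() relies on,
# assuming the Splunk custom search command protocol v2 framing: a header of
# the form "chunked 1.0,<metadata_length>,<body_length>\n" followed by the
# metadata JSON and then the body. The real helpers in this package may differ
# in error handling and encoding details; these toy versions are illustrative.
import json

def toy_read_chunk(stream):
    header = stream.readline()
    if not header:
        return None  # EOF: main() treats a falsy return as end of session
    _, metadata_length, body_length = header.strip().split(',')
    metadata = json.loads(stream.read(int(metadata_length)))
    body = stream.read(int(body_length))
    return metadata, body

def toy_write_chunk(stream, metadata, body):
    metadata_str = json.dumps(metadata)
    stream.write('chunked 1.0,%d,%d\n' % (len(metadata_str), len(body)))
    stream.write(metadata_str)
    stream.write(body)
    stream.flush()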