def handle_arguments(getinfo):
    """Validate the lookup arguments and build the controller options.

    Args:
        getinfo (dict): getinfo metadata

    Returns:
        controller_options (dict): controller options

    Raises:
        RuntimeError: if neither ``collection_name`` nor ``experiment_id``
            is present among the parsed params
    """
    parsed = parse_args(getinfo['searchinfo']['args'])
    named_params = parsed.get('params', {})

    collection_name = named_params.get('collection_name')
    experiment_id = named_params.get('experiment_id')

    # At least one of the two lookup targets has to be supplied.
    if all(target is None for target in (collection_name, experiment_id)):
        raise RuntimeError(
            'You must provide a KVStore collection name (collection_name=...) or an Experiment id (experiment_id=...)'
        )

    # The first raw argument is skipped when parsing the controller options.
    controller_options = parse_args(getinfo['searchinfo']['raw_args'][1:])

    fixed_entries = (
        ('processor', 'KVStoreLookupProcessor'),
        ('collection_name', collection_name),
        ('experiment_id', experiment_id),
    )
    for key, value in fixed_entries:
        controller_options[key] = value

    return controller_options
Example #2
0
    def handle_arguments(getinfo):
        """Validate the arguments and assemble the controller options.

        Args:
            getinfo (dict): getinfo metadata

        Returns:
            controller_options (dict): controller options

        Raises:
            RuntimeError: if no arguments were supplied
        """
        searchinfo = getinfo['searchinfo']
        positional = searchinfo['args']
        if len(positional) < 1:
            raise RuntimeError('First argument must be a KVStore collection name')

        # Everything after the first raw argument is parsed as options.
        controller_options = param_util.parse_args(searchinfo['raw_args'][1:])

        # The first positional argument yields the namespace and the
        # collection name.
        namespace, collection_name = param_util.parse_namespace_model_name(positional[0])
        controller_options['namespace'] = namespace
        controller_options['collection_name'] = collection_name
        controller_options['processor'] = 'KVStoreLookupProcessor'
        return controller_options
Example #3
0
    def handle_arguments(getinfo):
        """Take the getinfo metadata and return controller_options.

        Args:
            getinfo (dict): getinfo metadata from first chunk

        Returns:
            controller_options (dict): options to be passed to controller

        Raises:
            RuntimeError: if no scoring method was supplied as first argument
        """
        searchinfo = getinfo['searchinfo']

        # The scoring name is read from 'args', so 'args' has to be validated
        # as well: checking only 'raw_args' would let an empty 'args' list
        # slip through and fail later with an IndexError instead of this
        # explicit error.
        if not searchinfo['raw_args'] or not searchinfo['args']:
            raise RuntimeError('First argument must be a scoring method')

        # Everything after the first raw argument is parsed as options.
        raw_options = parse_args(searchinfo['raw_args'][1:])
        controller_options = ScoreCommand.handle_raw_options(raw_options)
        controller_options['scoring_name'] = searchinfo['args'][0]
        return controller_options
Example #4
0
    def handle_arguments(getinfo):
        """Reject a missing model argument and build the controller options.

        Args:
            getinfo (dict): getinfo metadata

        Returns:
            controller_options (dict): controller options

        Raises:
            RuntimeError: if no arguments were supplied
        """
        searchinfo = getinfo['searchinfo']
        positional = searchinfo['args']
        if len(positional) < 1:
            raise RuntimeError('First argument must be a saved model')

        # Everything after the first raw argument is parsed as options.
        controller_options = param_util.parse_args(searchinfo['raw_args'][1:])

        # The first positional argument yields the namespace and model name.
        namespace, model_name = param_util.parse_namespace_model_name(positional[0])
        controller_options['namespace'] = namespace
        controller_options['model_name'] = model_name
        controller_options['processor'] = 'SummaryProcessor'
        return controller_options
Example #5
0
    def handle_arguments(getinfo):
        """Take the getinfo metadata and return controller_options.

        Args:
            getinfo (dict): getinfo metadata from first chunk

        Returns:
            controller_options (dict): options to be passed to controller
            partial_fit (bool): boolean flag to indicate partial fit

        Raises:
            RuntimeError: if no algorithm was supplied as first argument
        """
        searchinfo = getinfo['searchinfo']

        # The algorithm name is read from 'args', so 'args' has to be
        # validated as well: checking only 'raw_args' would let an empty
        # 'args' list slip through and fail later with an IndexError instead
        # of this explicit error.
        if not searchinfo['raw_args'] or not searchinfo['args']:
            raise RuntimeError('First argument must be an "algorithm"')

        # Everything after the first raw argument is parsed as options.
        raw_options = parse_args(searchinfo['raw_args'][1:])
        controller_options, partial_fit = FitCommand.handle_raw_options(
            raw_options)
        controller_options['algo_name'] = searchinfo['args'][0]
        return controller_options, partial_fit
Example #6
0
    def handle_arguments(getinfo):
        """Build the controller options from the getinfo metadata.

        Args:
            getinfo (dict): getinfo metadata

        Returns:
            controller_options (dict): options to be sent to controller

        Raises:
            RuntimeError: if no arguments were supplied
        """
        searchinfo = getinfo['searchinfo']
        positional = searchinfo['args']
        if len(positional) < 1:
            raise RuntimeError('First argument must be a saved model.')

        # Everything after the first raw argument is parsed as options.
        controller_options = ApplyCommand.handle_raw_options(
            parse_args(searchinfo['raw_args'][1:]))

        # The first positional argument yields the namespace and model name.
        namespace, model_name = parse_namespace_model_name(positional[0])
        controller_options['namespace'] = namespace
        controller_options['model_name'] = model_name
        return controller_options
Example #7
0
    def handle_arguments(getinfo):
        """Build the controller options from the getinfo metadata.

        Besides returning the options, this replaces ``getinfo['searchinfo']``
        with the value returned by ``add_distributed_search_info``.

        Args:
            getinfo (dict): getinfo metadata

        Returns:
            controller_options (dict): options to be sent to controller

        Raises:
            RuntimeError: if no arguments were supplied
        """
        positional = getinfo['searchinfo']['args']
        if len(positional) < 1:
            raise RuntimeError('First argument must be a saved model.')

        # Everything after the first raw argument is parsed as options.
        controller_options = ApplyCommand.handle_raw_options(
            parse_args(getinfo['searchinfo']['raw_args'][1:]))

        # The first positional argument yields the namespace and model name.
        namespace, model_name = parse_namespace_model_name(positional[0])
        controller_options['namespace'] = namespace
        controller_options['model_name'] = model_name

        # Store the augmented searchinfo back on getinfo before building the
        # configuration object from it.
        getinfo['searchinfo'] = add_distributed_search_info(
            process_options=None, searchinfo=getinfo['searchinfo'])
        controller_options['mlspl_conf'] = MLSPLConf(getinfo['searchinfo'])
        return controller_options
Example #8
0
def main():
    """Sample events read in chunks from stdin and write them to stdout.

    The function runs in three phases:

    * Phase 0 reads the first chunk, parses and validates the arguments found
      in ``metadata['searchinfo']['args']`` and replies with the command type
      (``events`` for count mode, ``stateful`` otherwise).
    * Phase 1 reads event chunks and samples every record according to the
      selected mode (``ratio``, ``count``, ``proportional`` or ``partition``).
      Every mode except ``count`` writes its sampled events chunk by chunk;
      ``count`` mode keeps a reservoir and answers with empty chunks.
    * Phase 2 (``count`` mode only) writes the reservoir in slices of
      ``CHUNK_SIZE`` events, after which the final ``finished`` response is
      sent.

    Invalid arguments or data are reported through ``log_and_die``.
    NOTE(review): the code after each ``log_and_die`` call relies on that
    helper terminating the process -- confirm against its definition.
    """
    out_metadata = {}

    random_seed = None
    sample_mode = None  # ratio, count, proportional, partition
    sample_ratio = None
    sample_count = None
    sample_proportional_field = None
    sample_inverse = False
    sample_partitions = None
    sample_partition_fieldname = None
    sample_split_by_field = None

    # Phase 0: getinfo exchange

    metadata, body = read_chunk(sys.stdin)

    # parse args
    options = parse_args(metadata['searchinfo']['args'])

    # Named parameters (key=value): seed, ratio, count, proportional,
    # partitions and fieldname.
    if 'params' in options:
        try:
            params = convert_params(options.get('params', {}),
                                    ints=['seed', 'count', 'partitions'],
                                    floats=['ratio'],
                                    strs=['fieldname', 'proportional'],
                                    aliases={})
        except RuntimeError as e:
            log_and_die(out_metadata, str(e))

        if 'seed' in params:
            random_seed = params['seed']

            if random_seed < 0:
                log_and_die(out_metadata, "Random seed must not be negative.")
            else:
                random.seed(random_seed)
        else:
            random.seed()

        if 'ratio' in params:
            sample_mode = "ratio"

            sample_ratio = params['ratio']

            if sample_ratio < 0 or sample_ratio > 1:
                log_and_die(
                    out_metadata,
                    "Sampling ratio must be a valid probability (i.e., in the interval [0,1])."
                )

        # Each of the following parameters selects a sampling mode; only one
        # mode may be selected.
        if 'count' in params:
            if sample_mode is not None:
                log_and_die(out_metadata,
                            "More than one sampling mode specified.")
            else:
                sample_mode = "count"

            sample_count = params['count']

            if sample_count < 1:
                log_and_die(out_metadata,
                            "Sample count must be one or greater.")

        if 'proportional' in params:
            if sample_mode is not None:
                log_and_die(out_metadata,
                            "More than one sampling mode specified.")
            else:
                sample_mode = "proportional"

            sample_proportional_field = params['proportional']

        if 'partitions' in params:
            if sample_mode is not None:
                log_and_die(out_metadata,
                            "More than one sampling mode specified.")
            else:
                sample_mode = "partition"

            sample_partitions = params['partitions']

            if sample_partitions < 2:
                log_and_die(out_metadata,
                            "Must specify two or more partitions.")

        if sample_mode == "partition":
            if 'fieldname' in params:
                sample_partition_fieldname = params['fieldname']
            else:
                sample_partition_fieldname = 'partition_number'

        if 'fieldname' in params and sample_mode != "partition":
            log_and_die(
                out_metadata,
                "Only partition mode supports the fieldname parameter.")

    # Positional arguments: a bare number selects ratio mode (0 < x < 1) or
    # count mode (integer >= 1); the only accepted words are 'inverse' and
    # 'fieldname'.
    if 'args' in options:
        args = options['args']
        for arg in args:
            try:
                sample_arg = float(arg)

                if sample_arg > 0 and sample_arg < 1:
                    if sample_mode is not None:
                        log_and_die(out_metadata,
                                    "More than one sampling mode specified.")
                    sample_ratio = sample_arg
                    sample_mode = 'ratio'
                elif sample_arg >= 1 and sample_arg.is_integer():
                    if sample_mode is not None:
                        log_and_die(out_metadata,
                                    "More than one sampling mode specified.")
                    sample_count = sample_arg
                    sample_mode = 'count'
                else:
                    log_and_die(
                        out_metadata,
                        "Must specify either a number between 0 and 1, non-inclusive (sampling probability) or an integer greater than or equal to 1 (count of events)."
                    )

            except ValueError:
                # Not a number: only the known keywords are allowed.
                if arg != 'inverse' and arg != 'fieldname':
                    log_and_die(out_metadata,
                                "Unrecognized argument: %s" % arg)

        if 'inverse' in args:
            if sample_mode == 'proportional':
                sample_inverse = True
            else:
                log_and_die(
                    out_metadata,
                    "Only proportional mode supports the inverse parameter.")

    if 'split_by' in options:
        if sample_mode == 'count':
            try:
                sample_split_by_field = options['split_by']

                if sample_split_by_field == "":
                    log_and_die(out_metadata,
                                "Split-by field name is an empty string.")

            except ValueError:
                log_and_die(out_metadata, "Failed to parse split-by clause.")
        else:
            log_and_die(out_metadata,
                        "A by clause can only be used in count mode.")

    if sample_mode == 'count':
        capdata = {'type': 'events'}
    else:
        capdata = {'type': 'stateful'}

    write_chunk(sys.stdout, capdata, '')

    # need to buffer events for all modes because we need all the field names
    if sample_split_by_field is None:
        event_reservoir = []
    else:
        # One reservoir and one running record index per split-by value.
        event_reservoir = {}
        per_by_value_index = {}

    global_index = 0
    missing_split_by_field = 0

    # Field names are collected outside the chunk loop so that count mode,
    # which only writes its output in Phase 2, sees the fields of every chunk
    # rather than only those of the last one. The other modes reset the set
    # after each chunk they write.
    field_names = set()

    # Phase 1: sample the events as they come in

    while True:
        ret = read_chunk(sys.stdin)
        if not ret:
            break
        metadata, body = ret

        out_metadata = {}
        out_metadata['finished'] = False
        outbuf = StringIO.StringIO()

        # Number of records seen in this chunk; stays 0 for an empty chunk so
        # the global index is not advanced past records that never existed.
        chunk_record_count = 0

        reader = csv.DictReader(body.splitlines(), dialect='excel')
        for index, record in enumerate(reader):

            # RATIO MODE
            if sample_mode == "ratio":
                if random.random() <= sample_ratio:
                    event_reservoir.append(record)

            # COUNT MODE
            # Uses reservoir sampling: https://en.wikipedia.org/wiki/Reservoir_sampling#Example_implementation
            elif sample_mode == "count":
                gindex = index + global_index
                if sample_split_by_field is not None:
                    if sample_split_by_field in record and record[
                            sample_split_by_field] != "":
                        by_value = record[sample_split_by_field]
                    else:
                        # Records without the field share one bucket.
                        by_value = 'NULL_BY_VALUE'
                        missing_split_by_field += 1

                    eres = event_reservoir.setdefault(by_value, [])
                    # With a split-by field the index is counted per value.
                    gindex = per_by_value_index.get(by_value, 0)

                    if len(per_by_value_index) > BY_VALUE_MAX:
                        log_and_die(
                            out_metadata,
                            "Too many values (> %d) for split-by field %s." %
                            (BY_VALUE_MAX, sample_split_by_field))

                    per_by_value_index[by_value] = gindex + 1
                else:
                    eres = event_reservoir

                if gindex < sample_count:
                    # Reservoir not full yet: keep the record.
                    eres.append({'gindex': gindex, 'record': record})
                else:
                    # Replace a random slot with decreasing probability.
                    r = random.randint(0, gindex)
                    if r < sample_count:
                        eres[r] = {'gindex': gindex, 'record': record}

            # PROPORTIONAL MODE
            elif sample_mode == "proportional":
                if sample_proportional_field not in record:
                    # Report the proportional field itself; the partition
                    # field name is not set in this mode.
                    log_and_die(
                        out_metadata,
                        "The specified field for proportional sampling does not exist: %s"
                        % sample_proportional_field)

                try:
                    sample_proportional_val = float(
                        record[sample_proportional_field])
                except ValueError:
                    log_and_die(
                        out_metadata,
                        "The specified field for proportional sampling (%s) contains a non-numeric value: %s."
                        % (sample_proportional_field,
                           record[sample_proportional_field]))

                if sample_proportional_val < 0 or sample_proportional_val > 1:
                    log_and_die(
                        out_metadata,
                        "The field to use for proportional sampling must be a valid probability (i.e., between 0 and 1). Received %f."
                        % sample_proportional_val)

                if sample_inverse:
                    sample_proportional_val = 1 - sample_proportional_val

                if random.random() <= sample_proportional_val:
                    event_reservoir.append(record)

            # PARTITION MODE
            elif sample_mode == "partition":
                p = random.randint(0, sample_partitions - 1)

                if sample_partition_fieldname is None:
                    sample_partition_fieldname = 'partition_number'

                if sample_partition_fieldname in record:
                    log_and_die(
                        out_metadata,
                        "The specified field name for the partition already exists: %s"
                        % sample_partition_fieldname)
                else:
                    record[sample_partition_fieldname] = p
                    event_reservoir.append(record)
            else:
                log_and_die(
                    out_metadata,
                    "Invalid sampling mode specified: %s" % sample_mode)

            chunk_record_count = index + 1
            # we do this at the end so any added fields are included
            field_names = field_names.union(set(record.keys()))

        # finished reading the chunk; do any per-chunk actions
        global_index = global_index + chunk_record_count

        if sample_mode != 'count':
            writer = csv.DictWriter(outbuf,
                                    fieldnames=list(field_names),
                                    dialect='excel',
                                    extrasaction='ignore')
            writer.writeheader()

            for event in event_reservoir:
                writer.writerow(event)

            write_chunk(sys.stdout, out_metadata, outbuf.getvalue())
            event_reservoir = []
            field_names = set()
        else:
            # Count mode emits nothing until every chunk has been read.
            write_chunk(sys.stdout, {"finished": False}, '')

        if metadata.get('finished', False):
            break

    # Phase 2: output (count mode) and wrap-up

    if sample_mode == 'count':
        if sample_split_by_field is not None:
            merged_event_reservoir = []

            for by_value in event_reservoir:
                merged_event_reservoir.extend(event_reservoir[by_value])

            event_reservoir = sorted(merged_event_reservoir,
                                     key=lambda val: val['gindex'])
        else:
            event_reservoir = sorted(event_reservoir,
                                     key=lambda val: val['gindex'])

        # loop over CHUNK_SIZE slices of event_reservoir
        num_chunks = int(math.ceil(len(event_reservoir) / float(CHUNK_SIZE)))

        for i_chunk in range(num_chunks):
            # A chunk is read before each slice is written; stop when the
            # input ends.
            if not read_chunk(sys.stdin):
                break

            outbuf = StringIO.StringIO()
            writer = csv.DictWriter(outbuf,
                                    fieldnames=list(field_names),
                                    dialect='excel',
                                    extrasaction='ignore')
            writer.writeheader()

            for val in event_reservoir[i_chunk *
                                       CHUNK_SIZE:i_chunk * CHUNK_SIZE +
                                       CHUNK_SIZE]:
                writer.writerow(val['record'])

            write_chunk(sys.stdout, {"finished": False}, outbuf.getvalue())

    # we're done, so send final response to finish the session
    ret = read_chunk(sys.stdin)
    if ret:
        out_metadata = {}
        out_metadata['finished'] = True

        if missing_split_by_field > 0:
            log_and_warn(
                out_metadata,
                "%d events (out of %d) were missing the %s field and were sampled as though they all had the same value."
                %
                (missing_split_by_field, global_index, sample_split_by_field))

        write_chunk(sys.stdout, out_metadata, '')