Example 1
def _optimize_loss(probe_counts,
                   loss_fn,
                   bounds,
                   x0,
                   initial_eps=10.0,
                   step_size=0.001,
                   interp_fn_type='standard'):
    """Optimize loss function with barrier.

    This uses scipy's optimize.fmin_tnc to minimize the loss function
    in which the barrier is weighted by eps. It repeatedly minimizes
    the loss while decreasing eps so that, by the last iteration, the
    weight on the barrier is very small. On each iteration, it starts
    the initial guess/position at the solution to the previous iteration.

    Args:
        probe_counts: dict giving number of probes required for each
            dataset and choice of parameters
        loss_fn: the loss function provided by _make_loss_fn
        bounds: bounds on the parameter values provided by _make_param_bounds_*
        x0: the initial guess of parameter values (i.e., starting position)
        initial_eps: weight of the barrier on the first iteration
        step_size: epsilon value provided to optimize.fmin_tnc
        interp_fn_type: 'standard' (only perform interpolation on mismatches
            and cover_extension parameters) or 'nd' (use scipy's interpolate
            package to interpolate over n-dimensions)

    Returns:
        list of length (number of datasets)*(number of parameters) where
        x_i is the (i % N)'th parameter of the (i // N)'th dataset,
        for i=0,1,2,... where N=(number of parameters)
    """
    eps = initial_eps
    while eps >= 0.01:
        x0_probe_count = ic._make_total_probe_count_across_datasets_fn(
            probe_counts, interp_fn_type=interp_fn_type)(x0)
        logger.info(("Starting an iteration with eps=%f, with x0 yielding %f "
                     "probes"), eps, x0_probe_count)

        sol, nfeval, rc = optimize.fmin_tnc(loss_fn,
                                            x0,
                                            bounds=bounds,
                                            args=(eps, ),
                                            approx_grad=True,
                                            epsilon=step_size,
                                            disp=1,
                                            maxfun=2500)

        if rc in [0, 1, 2]:
            # rc == 0 indicates reaching a local minimum, rc == 1 indicates
            # the function value converged, and rc == 2 indicates the
            # parameter values converged
            logger.info("  Iteration was successful")
        else:
            logger.info("  Iteration failed to converge!")

        x0 = sol
        eps = 0.1 * eps

    return sol
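The decreasing-eps loop above is a standard barrier-method pattern: minimize the objective plus an eps-weighted log barrier, shrink eps, and warm-start each iteration at the previous solution. Below is a minimal, self-contained sketch of the same pattern on a toy problem (minimize x^2 subject to x >= 1); the toy loss and all values are illustrative and are not part of the module above.

import numpy as np
from scipy import optimize

def toy_loss(x, eps):
    # Objective x^2 plus a log barrier enforcing x > 1, weighted by eps
    xv = x[0]
    if xv <= 1:
        # Outside the barrier; return a large value so the optimizer backs off
        return 1e7
    return xv**2 - eps * np.log(xv - 1)

x0 = [5.0]
eps = 10.0
while eps >= 0.01:
    sol, nfeval, rc = optimize.fmin_tnc(toy_loss, x0, bounds=[(0.0, 10.0)],
                                        args=(eps,), approx_grad=True,
                                        epsilon=0.001, disp=0, maxfun=2500)
    x0 = sol   # warm-start the next iteration at the previous solution
    eps *= 0.1
# As eps shrinks, sol[0] approaches the constrained minimum at x = 1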
Example 2
def higher_dimensional_search(param_names, probe_counts, max_total_count,
        loss_coeffs=None, dataset_weights=None):
    """Search over multiple arbitrary parameters.

    Unlike the standard search, this can search over any number of
    provided parameters. It interpolates linearly using a function
    from scipy for unstructured data of an arbitrary dimension. Unlike
    the standard search, this does not round values (e.g., they may
    remain fractional even if the parameter is integral).

    Args:
        param_names: tuple giving names of parameters
        probe_counts: dict giving number of probes required for each
            dataset and choice of parameters (the tuple specifying
            a choice of parameter values is ordered such that parameters
            correspond to those in param_names)
        max_total_count: upper bound on the number of total probes
        loss_coeffs: coefficient to use for each parameter in the loss
            function. If not set, default is 1 for each parameter
        dataset_weights: dict giving weight in the loss function for each
            dataset; if not set, default is a weight of 1 for each dataset

    Returns:
        tuple (x, y, z) where:
            x is a dict {dataset: p} where p is a tuple giving optimal
                values for parameters, corresponding to the order in
                param_names
            y is the total number of probes required with the parameters in x
            z is the loss for the parameter values in x
    """
    num_params = len(param_names)

    # Set default values for arguments provided as None
    if loss_coeffs is None:
        # The default coefficient is 1 for each parameter
        loss_coeffs = tuple(1.0 for _ in range(num_params))
    else:
        # There must be a coefficient for each parameter
        assert len(loss_coeffs) == num_params
        loss_coeffs = tuple(loss_coeffs)
    if dataset_weights:
        # There should be a weight for each dataset
        for d in probe_counts.keys():
            assert d in dataset_weights
    else:
        dataset_weights = {d: 1.0 for d in probe_counts.keys()}

    # Set up the loss function and make an initial guess
    loss_fn = _make_loss_fn(probe_counts, max_total_count, loss_coeffs,
        dataset_weights, interp_fn_type='nd')
    x0 = _make_initial_guess(probe_counts, None, num_params)

    # Find the optimal parameter values, interpolating probe counts
    # for parameter values between what have been explicitly calculated
    bounds = _make_param_bounds_nd(probe_counts)
    x_sol = _optimize_loss(probe_counts, loss_fn, bounds, x0,
        interp_fn_type='nd')

    x_sol_dict = {}
    for i, dataset in enumerate(sorted(probe_counts.keys())):
        x_sol_dict[dataset] = tuple(x_sol[num_params * i + j]
            for j in range(num_params))

    x_sol_count = ic._make_total_probe_count_across_datasets_fn(
        probe_counts, interp_fn_type='nd')(x_sol)
    x_sol_loss = loss_fn(x_sol, 0)

    return (x_sol_dict, x_sol_count, x_sol_loss)
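A hypothetical invocation of higher_dimensional_search is sketched below. The dataset names, parameter values, and probe counts are made up for illustration; it assumes, as the docstring describes, that probe_counts maps each dataset to a dict from parameter-value tuples (ordered as in param_names) to precomputed probe counts.

param_names = ('mismatches', 'cover_extension')
probe_counts = {
    'virus_a': {(m, e): 12000 - 1500 * m - 100 * e
                for m in (0, 2, 4) for e in (0, 20, 40)},
    'virus_b': {(m, e): 20000 - 2000 * m - 150 * e
                for m in (0, 2, 4) for e in (0, 20, 40)},
}
opt_params, total_count, loss = higher_dimensional_search(
    param_names, probe_counts, max_total_count=20000)
# opt_params might be, e.g., {'virus_a': (2.3, 31.8), 'virus_b': (1.7, 24.5)};
# values may be fractional because this search does not round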
Example 3
def standard_search(probe_counts, max_total_count,
        verify_without_interp=False, round_params=None,
        loss_coeffs=None, dataset_weights=None):
    """Search over mismatches and cover extension only.

    This performs the standard search, which finds optimal values of
    the mismatches and cover extension parameters subject to the
    constraint on the total number of probes. It performs linear
    interpolation between calculated values of these two parameters
    for each dataset. It rounds these parameters to integers or, if
    desired, to the nearest value on an evenly spaced grid.

    Args:
        probe_counts: dict giving number of probes required for each
            dataset and choice of parameters
        max_total_count: upper bound on the number of total probes
        verify_without_interp: if True, check that the total probe count
            calculated without interpolation is the same as that calculated
            after rounding parameter values
        round_params: tuple (m, e); round mismatches to the nearest
            multiple of m and cover_extension to the nearest multiple
            of e (both m and e are int). If not set, default is
            (m, e) = (1, 1).
        loss_coeffs: tuple (m, e) giving the coefficients for the
            mismatches (m) and cover_extension (e) parameters in the
            loss function; if not set, default is (m, e) = (1, 1/100)
        dataset_weights: dict giving weight in the loss function for each
            dataset; if not set, default is a weight of 1 for each dataset

    Returns:
        tuple (x, y, z) where:
            x is a dict {dataset: p} where p is a tuple
                (mismatches, cover_extension) giving optimal values for
                those parameters
            y is the total number of probes required with the parameters in x
            z is the loss for the parameter values in x
    """
    # Set default values for arguments provided as None
    if loss_coeffs:
        # There should be a coefficient for each of the 2 parameters
        assert len(loss_coeffs) == 2
        loss_coeffs = tuple(loss_coeffs)
    else:
        loss_coeffs = (1.0, 1.0/100.0)
    if dataset_weights:
        # There should be a weight for each dataset
        for d in probe_counts.keys():
            assert d in dataset_weights
    else:
        dataset_weights = {d: 1.0 for d in probe_counts.keys()}
    if round_params:
        mismatches_round, cover_extension_round = round_params
    else:
        mismatches_round, cover_extension_round = 1, 1

    # Set up the loss function and parameter bounds, and make an initial guess
    loss_fn = _make_loss_fn(probe_counts, max_total_count, loss_coeffs,
        dataset_weights, interp_fn_type='standard')
    bounds = _make_param_bounds_standard(probe_counts)
    x0 = _make_initial_guess(probe_counts, bounds, 2)

    # Find the optimal parameter values, interpolating probe counts
    # for parameter values between what have been explicitly calculated
    x_sol = _optimize_loss(probe_counts, loss_fn, bounds, x0,
        interp_fn_type='standard')

    # Log the parameter values for each dataset, and the total probe count
    logger.info("##############################")
    logger.info("Continuous parameter values:")
    _log_params_by_dataset(x_sol, probe_counts, "float")
    x_sol_count = ic._make_total_probe_count_across_datasets_fn(
        probe_counts, interp_fn_type='standard')(x_sol)
    logger.info("TOTAL INTERPOLATED PROBE COUNT: %f", x_sol_count)
    logger.info("##############################")

    # Round the interpolated parameter values
    opt_params = _round_params(x_sol, probe_counts, max_total_count,
        loss_coeffs, dataset_weights,
        mismatches_round=mismatches_round,
        cover_extension_round=cover_extension_round)

    # Log the rounded parameter values, the total probe count, and the
    # loss on the rounded values
    logger.info("##############################")
    logger.info("Rounded parameter values:")
    _log_params_by_dataset(opt_params, probe_counts, "int")
    opt_params_count = ic._make_total_probe_count_across_datasets_fn(
        probe_counts, interp_fn_type='standard')(opt_params)
    opt_params_loss = loss_fn(opt_params, 0)
    logger.info("TOTAL PROBE COUNT: %d", opt_params_count)
    logger.info("TOTAL PARAMS LOSS: %f", opt_params_loss)
    logger.info("##############################")

    # Log the total probe count on the rounded parameter values without
    # using interpolation
    if verify_without_interp:
        logger.info("##############################")
        opt_params_count_no_interp = _total_probe_count_without_interp(opt_params, probe_counts)
        logger.info("TOTAL PROBE COUNT WITHOUT INTERP: %d", opt_params_count_no_interp)
        logger.info("##############################")
        # As a sanity check, verify that we get the same total probe count
        # without using interpolation
        assert opt_params_count == opt_params_count_no_interp

    opt_params_dict = {}
    for i, dataset in enumerate(sorted(probe_counts.keys())):
        mismatches = opt_params[2 * i]
        cover_extension = opt_params[2 * i + 1]
        opt_params_dict[dataset] = (mismatches, cover_extension)

    return (opt_params_dict, opt_params_count, opt_params_loss)
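A hypothetical call is sketched below, reusing the toy probe_counts dict from the sketch after Example 2 (illustrative values only). Here cover_extension is rounded to the nearest multiple of 10 and 'virus_b' is weighted twice as heavily in the loss.

opt_params, total_count, loss = standard_search(
    probe_counts, max_total_count=20000,
    verify_without_interp=True,
    round_params=(1, 10),
    dataset_weights={'virus_a': 1.0, 'virus_b': 2.0})
# opt_params might be, e.g., {'virus_a': (2, 30), 'virus_b': (2, 20)}, with
# mismatches rounded to an integer and cover_extension to a multiple of 10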
Example 4
def _round_params(params, probe_counts, max_total_count, loss_coeffs, weights,
        mismatches_eps=0.01, cover_extension_eps=0.1,
        mismatches_round=1, cover_extension_round=1):
    """Round parameter values while satisfying the constraint on total count.

    This is only applied to the mismatches and cover_extension parameters.

    Parameter values found by the search are floats. We want the mismatches
    and cover_extension parameters to be integers, or to fit on a specified
    grid.

    The floats, as given in params, should satisfy the constraint (i.e.,
    the interpolated total number of probes is less than max_total_count).
    Thus, we can round them up, because (generally) increasing the parameter
    values will decrease the number of probes; therefore, after rounding up
    they should still satisfy the constraint.

    But we also check if the parameter values are within eps of their
    rounded-down value. The loss optimizer has a tendency to make this happen
    for some parameters (e.g., finding an optimal mismatches parameter value
    of 1.00001). The reason likely has to do with the fact that, because
    we are linearly interpolating total probe counts, the gradient of the
    barrier function changes greatly around certain values (i.e., around
    the actual data values). That is, the barrier function is not at all
    smooth around actual data values. This may cause the optimizer to
    yield parameter values that are very close to parameter values for
    which probe counts have actually been computed.

    After rounding up, some parameters are decreased; we repeatedly
    choose to decrease the parameter whose reduction yields the smallest
    loss while still yielding a number of probes that is less than
    max_total_count.

    Args:
        params: parameter values to use when determining probe counts;
            params[2*i] is the number of mismatches of the i'th dataset
            and params[2*i+1] is the cover extension of the i'th dataset
        probe_counts: dict giving number of probes for each dataset and
            choice of parameters
        max_total_count: upper bound on the number of total probes
        loss_coeffs: tuple (m, e) giving the coefficients for the
            mismatches (m) and cover_extension (e) parameters in the
            loss function
        weights: dict giving weight in the loss function for each dataset
        mismatches_eps/cover_extension_eps: eps as defined above for
            mismatches and cover_extension
        mismatches_round/cover_extension_round: round mismatches and
            cover_extension to the nearest multiple of this

    Returns:
        list in which index i corresponds to the parameter given in
        params[i], but rounded
    """
    # This requires that the only two parameters be mismatches and
    # cover_extension
    num_datasets = len(probe_counts)
    assert len(params) == 2*num_datasets
    assert len(loss_coeffs) == 2

    params_rounded = []
    for i, dataset in enumerate(sorted(probe_counts.keys())):
        mismatches, cover_extension = params[2 * i], params[2 * i + 1]

        if mismatches - ic._round_down(mismatches, mismatches_round) < mismatches_eps:
            # Round mismatches down
            mismatches = ic._round_down(mismatches, mismatches_round)
        else:
            # Round mismatches up
            mismatches = ic._round_up(mismatches, mismatches_round)

        if cover_extension - ic._round_down(cover_extension, cover_extension_round) < cover_extension_eps:
            # Round cover_extension down
            cover_extension = ic._round_down(cover_extension, cover_extension_round)
        else:
            # Round cover_extension up
            cover_extension = ic._round_up(cover_extension, cover_extension_round)

        params_rounded += [mismatches, cover_extension]

    total_probe_count = ic._make_total_probe_count_across_datasets_fn(
        probe_counts, interp_fn_type='standard')
    # Verify that the probe count satisfies the constraint
    # Note that this assertion may fail if we are dealing with datasets
    # for which few actual probe counts have been computed; in these
    # cases, the interpolation may severely underestimate the number
    # of probes at a particular parameter choice
    assert total_probe_count(params_rounded) < max_total_count

    # Keep decreasing parameters while satisfying the constraint.
    # In particular, choose to decrease the parameter whose reduction
    # yields the smallest loss while still satisfying the constraint.
    loss_fn = _make_loss_fn(probe_counts, max_total_count, loss_coeffs,
        weights, interp_fn_type='standard')
    while True:
        curr_loss = loss_fn(params_rounded, 0)
        # Find a parameter to decrease
        min_loss, min_loss_new_params = curr_loss, None
        for i in range(len(params_rounded)):
            params_tmp = list(params_rounded)
            if params_tmp[i] == 0:
                # This cannot be decreased
                continue
            if i % 2 == 0:
                # This is a mismatch; decrease by the rounding multiple
                params_tmp[i] -= mismatches_round
            else:
                # This is a cover_extension; decrease by the rounding multiple
                params_tmp[i] -= cover_extension_round
            if total_probe_count(params_tmp) >= max_total_count:
                # This change yields too many probes, so skip it
                continue
            new_loss = loss_fn(params_tmp, 0)
            if new_loss < min_loss:
                min_loss = new_loss
                min_loss_new_params = params_tmp

        if min_loss_new_params is not None:
            # There was a change that led to a better loss, so
            # update params_rounded
            params_rounded = min_loss_new_params
        else:
            # No parameter change satisfies the constraint and
            # yields an improvement in the loss
            break

    return params_rounded
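The helpers ic._round_down and ic._round_up are not shown in this module. A plausible reading, sketched below with hypothetical stand-ins, is rounding down/up to the nearest multiple; under that assumption, the snippet illustrates the eps tolerance described in the docstring.

import math

def round_down(x, multiple):
    # Hypothetical stand-in for ic._round_down
    return multiple * math.floor(x / multiple)

def round_up(x, multiple):
    # Hypothetical stand-in for ic._round_up
    return multiple * math.ceil(x / multiple)

mismatches_eps, mismatches_round = 0.01, 1
for mismatches in (1.00001, 1.4):
    if mismatches - round_down(mismatches, mismatches_round) < mismatches_eps:
        rounded = round_down(mismatches, mismatches_round)
    else:
        rounded = round_up(mismatches, mismatches_round)
    print(mismatches, '->', rounded)   # 1.00001 -> 1, 1.4 -> 2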
Example 5
def _make_loss_fn(probe_counts, max_total_count, coeffs, weights,
        interp_fn_type='standard'):
    """Generate and return a loss function.

    The function calculates a loss over the parameters and adds onto
    that loss to meet a constraint based on the barrier method. It
    uses a logarithmic barrier function to enforce the constraint that
    the total probe count be <= max_total_count.

    The loss over the parameters is:
        sum_{datasets d} (w_d * (sum_{param j} (c_j * (v_{dj})^2)))
    where v_{dj} is the value of the j'th parameter for dataset d,
    c_j is the coefficient for the j'th parameter, and w_d is the
    weight of dataset d

    Args:
        probe_counts: dict giving number of probes required for each
            dataset and choice of parameters
        max_total_count: upper bound on the number of total probes
        coeffs: coefficient in the loss function for each parameter, in
            the same order as the parameters are given
        weights: dict giving weight in the loss function for each dataset
        interp_fn_type: 'standard' (only perform interpolation on mismatches
            and cover_extension parameters) or 'nd' (use scipy's interpolate
            package to interpolate over n-dimensions)

    Returns:
        a function that is the sum of a loss defined over the parameters
        and a value designed to enforce a barrier on the total number
        of probes
    """
    total_probe_count_across_datasets = ic._make_total_probe_count_across_datasets_fn(
        probe_counts, interp_fn_type=interp_fn_type)

    def loss(x, *func_args):
        """
        Compute a loss.

        Let the number of parameters per dataset be N.
        x is a list giving all the parameter values across datasets,
        such that x_i is the (i % N)'th parameter of the (i // N)'th dataset,
        for i=0,1,2,...
        """
        num_datasets = len(probe_counts)
        # The number of parameter values must be a multiple of the number
        # of datasets
        assert len(x) % num_datasets == 0

        num_params = int(len(x) / num_datasets)

        # There must be a coefficient for each parameter
        assert len(coeffs) == num_params

        # First compute a loss over the parameters by taking their L2-norm
        # This is the function we really want to minimize
        opt_val = 0
        for i, dataset in enumerate(sorted(probe_counts.keys())):
            opt_val_dataset = 0
            for j in range(num_params):
                v = x[num_params * i + j]
                opt_val_dataset += coeffs[j] * np.power(v, 2.0)
            opt_val += weights[dataset] * opt_val_dataset

        # We also have the constraint that the total probe count be less than
        # max_total_count
        # We add a barrier function to enforce this constraint and weight the
        # barrier by eps
        eps = func_args[0]
        total_probe_count = total_probe_count_across_datasets(x)
        if np.isnan(total_probe_count):
            # If the interp_fn_type is 'nd' and the parameter values are
            # outside the convex hull of computed points (from probe_counts)
            # for a dataset, scipy's interpolator will be unable to
            # interpolate a probe count and will return nan; here, make
            # the loss (through the barrier) high to reflect this
            logger.warning(("Parameter values being searched are outside "
                "the convex hull of computed points; unable to interpolate "
                "a probe count"))
            barrier_val = 10000000
        elif total_probe_count >= max_total_count:
            # Since the count is beyond the barrier, we should in theory
            # return infinity. But if the optimizer does indeed try parameters
            # that put the probe count here, it would be unable to compute
            # an approximate gradient and may get stuck. So help it out
            # by giving a value such that the negative gradient points toward
            # a direction outside the barrier.
            # Add 1 so that, if total_probe_count == max_total_count, we do
            # not take log(0).
            barrier_val = 9999 + 10000.0 * np.log((total_probe_count -
                                                  max_total_count + 1))
        else:
            # The barrier function is -log(max_total_count - total_probe_count), to
            # enforce the constraint that total_probe_count be less than
            # max_total_count.
            # Add 1 so that, if max_total_count - total_probe_count < 1,
            # the argument to log(..) remains >= 1.
            barrier_val = -1.0 * eps * np.log((max_total_count -
                                               total_probe_count + 1))

        return opt_val + barrier_val

    return loss
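To make the loss concrete, here is a small worked computation (toy numbers only) of opt_val and barrier_val for two datasets with parameters (mismatches, cover_extension), following the formula and the barrier described above.

import numpy as np

coeffs = (1.0, 1.0 / 100.0)        # c_j for (mismatches, cover_extension)
weights = {'d1': 1.0, 'd2': 1.0}   # w_d
x = [2.0, 20.0,                    # d1: mismatches=2, cover_extension=20
     3.0, 40.0]                    # d2: mismatches=3, cover_extension=40

# opt_val = sum_d w_d * sum_j c_j * (v_dj)^2
opt_val = (weights['d1'] * (coeffs[0] * 2.0**2 + coeffs[1] * 20.0**2) +
           weights['d2'] * (coeffs[0] * 3.0**2 + coeffs[1] * 40.0**2))
# = (4 + 4) + (9 + 16) = 33.0

# Barrier for a (hypothetical) interpolated total of 9,500 probes against
# a limit of 10,000, with eps = 0.1
eps = 0.1
total_probe_count, max_total_count = 9500, 10000
barrier_val = -1.0 * eps * np.log(max_total_count - total_probe_count + 1)
# = -0.1 * log(501) ~= -0.62

loss_value = opt_val + barrier_val   # ~= 32.38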
Example 6
def higher_dimensional_search(param_names,
                              probe_counts,
                              max_total_count,
                              loss_coeffs=None,
                              dataset_weights=None):
    """Search over multiple arbitrary parameters.

    Unlike the standard search, this can search over any number of
    provided parameters. It interpolates linearly using a function
    from scipy for unstructured data of an arbitrary dimension. Unlike
    the standard search, this does not round values (e.g., they may
    remain fractional even if the parameter is integral).

    Args:
        param_names: tuple giving names of parameters
        probe_counts: dict giving number of probes required for each
            dataset and choice of parameters (the tuple specifying
            a choice of parameter values is ordered such that parameters
            correspond to those in param_names)
        max_total_count: upper bound on the number of total probes
        loss_coeffs: coefficient to use for each parameter in the loss
            function. If not set, default is 1 for each parameter
        dataset_weights: dict giving weight in the loss function for each
            dataset; if not set, default is a weight of 1 for each dataset

    Returns:
        tuple (x, y, z) where:
            x is a dict {dataset: p} where p is a tuple giving optimal
                values for parameters, corresponding to the order in
                param_names
            y is the total number of probes required with the parameters in x
            z is the loss for the parameter values in x
    """
    num_params = len(param_names)

    # Set default values for arguments provided as None
    if loss_coeffs is None:
        # The default coefficient is 1 for each parameter
        logger.warning(("Using a default coefficient for 1 for each parameter "
                        "in the loss function"))
        loss_coeffs = tuple(1.0 for _ in range(num_params))
    else:
        # There must be a coefficient for each parameter
        assert len(loss_coeffs) == num_params
        loss_coeffs = tuple(loss_coeffs)
    if dataset_weights:
        # There should be a weight for each dataset
        for d in probe_counts.keys():
            assert d in dataset_weights
    else:
        dataset_weights = {d: 1.0 for d in probe_counts.keys()}

    # Set up the loss function and make an initial guess
    loss_fn = _make_loss_fn(probe_counts,
                            max_total_count,
                            loss_coeffs,
                            dataset_weights,
                            interp_fn_type='nd')
    x0 = _make_initial_guess(probe_counts, None, num_params)

    # Find the optimal parameter values, interpolating probe counts
    # for parameter values between what have been explicitly calculated
    bounds = _make_param_bounds_nd(probe_counts)
    x_sol = _optimize_loss(probe_counts,
                           loss_fn,
                           bounds,
                           x0,
                           interp_fn_type='nd')

    x_sol_dict = {}
    for i, dataset in enumerate(sorted(probe_counts.keys())):
        x_sol_dict[dataset] = tuple(x_sol[num_params * i + j]
                                    for j in range(num_params))

    x_sol_count = ic._make_total_probe_count_across_datasets_fn(
        probe_counts, interp_fn_type='nd')(x_sol)
    x_sol_loss = loss_fn(x_sol, 0)

    # Verify that the probe count satisfies the constraint
    if x_sol_count > max_total_count:
        msg = ("The total probe count based on parameter values found "
               "in the search (%d) exceeds the given limit (%d). This "
               "is likely to happen if the range of the precomputed "
               "parameter values is not as large as it needs to be to "
               "satisfy the constraint. That is, one or more parameter "
               "values may need to be more loose to obtain %d probes. To "
               "fix this, try inputting probe counts for a larger range "
               "(in particular, less stringent choices) of parameter "
               "values. Also, note that the search interpolates probe "
               "counts between precomputed parameter values (%d may be an "
               "interpolated count) and, if the precomputed parameter values "
               "are too sparse (i.e., too few actual probe counts were "
               "input), it may be underestimating the true number of probes "
               "required." %
               (x_sol_count, max_total_count, max_total_count, x_sol_count))
        raise CannotSatisfyProbeCountConstraintError(msg)

    return (x_sol_dict, x_sol_count, x_sol_loss)
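A hypothetical way to handle that failure is to catch the exception, recompute probe counts over a wider (less stringent) range of parameter values, and retry; the sketch below assumes CannotSatisfyProbeCountConstraintError is importable from the same module as higher_dimensional_search.

try:
    opt_params, total_count, loss = higher_dimensional_search(
        param_names, probe_counts, max_total_count=20000)
except CannotSatisfyProbeCountConstraintError:
    # Recompute probe_counts over a wider (less stringent) range of
    # parameter values, then retry the search
    ...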