Beispiel #1
0
    def test_standard_search_vwafr_high_count(self):
        """Integration test with the V-WAfr probe set data."""
        # With a generous budget of 1,000,000 probes, the search should
        # be able to settle on the most conservative value (0) for both
        # parameters
        max_count = 1000000
        result = param_search.standard_search(
            self.probe_counts_vwafr, max_count)
        opt_params, opt_params_count, opt_params_loss = result

        self.assertLess(opt_params_count, max_count)

        for param_vals in opt_params.values():
            mismatches, cover_extension = param_vals
            self.assertEqual(mismatches, 0)
            self.assertEqual(cover_extension, 0)
Beispiel #2
0
    def test_standard_search_vwafr_with_coefficients(self):
        """Integration test with the V-WAfr probe set data.

        This sets the coefficients in the loss function such that
        mismatches has little impact on loss and cover_extension
        dominates. This should drive the mismatches parameter high
        because that will yield smaller probe counts.
        """
        # Note that, by default, loss_coeffs is (1.0, 0.01); swap the
        # emphasis so cover_extension dominates the loss
        loss_coeffs = (0.01, 1.0)

        # Test the standard search
        ss = param_search.standard_search(self.probe_counts_vwafr,
                                          50000,
                                          loss_coeffs=loss_coeffs)
        opt_params, opt_params_count, opt_params_loss = ss
        self.assertLess(opt_params_count, 50000)
        for param_vals in opt_params.values():
            mismatches, cover_extension = param_vals
            # Mismatches should be high
            self.assertGreater(mismatches, 5)
Beispiel #3
0
    def test_standard_search_vwafr_with_dataset_weights(self):
        """Integration test with the V-WAfr probe set data.

        This sets the dataset weights in the loss function such that
        all but two datasets have little impact on the loss. The
        other two datasets should dominate the loss and they should
        have small parameter values.
        """
        # Give two relatively diverse datasets (which would generally
        # have high parameter values) much larger weights than the rest
        heavy = ['hiv1_without_ltr', 'hepatitis_c']
        dataset_weights = {name: 1.0
                           for name in self.probe_counts_vwafr.keys()}
        for name in heavy:
            dataset_weights[name] = 1000.0

        # Run the standard search
        result = param_search.standard_search(self.probe_counts_vwafr,
                                              420000,
                                              dataset_weights=dataset_weights)
        opt_params, opt_params_count, opt_params_loss = result
        self.assertLess(opt_params_count, 420000)

        # Both parameter values should be small for the heavily
        # weighted (diverse) datasets
        for name in heavy:
            mismatches, cover_extension = opt_params[name]
            self.assertLessEqual(mismatches, 1)
            self.assertLessEqual(cover_extension, 10)

        # At least one other dataset should have a larger value for
        # each of the 2 parameters
        num_large_mismatches = 0
        num_large_cover_extension = 0
        for name in dataset_weights.keys() - set(heavy):
            mismatches, cover_extension = opt_params[name]
            if mismatches > 1:
                num_large_mismatches += 1
            if cover_extension > 10:
                num_large_cover_extension += 1
        self.assertGreater(num_large_mismatches, 0)
        self.assertGreater(num_large_cover_extension, 0)
Beispiel #4
0
 def search_fn(max_total_count):
     """Run a standard search over the V-WAfr probe counts."""
     return param_search.standard_search(
         self.probe_counts_vwafr, max_total_count)
Beispiel #5
0
def main(args):
    """Search for optimal parameter values over a table of probe counts.

    Reads the probe-count table (and, optionally, per-dataset weights),
    runs either the standard 2-parameter search or a higher-dimensional
    search, writes the optimal parameter values to a TSV, and prints
    the resulting total probe count and loss.

    Args:
        args: parsed command-line arguments; must provide
            probe_count_tsv, dataset_weights_tsv, loss_coeffs, use_nd,
            round_params, target_probe_count, and param_vals_tsv

    Raises:
        ValueError: if the number of loss coefficients does not match
            the number of parameters, if '--use-nd' is combined with
            '--round-params', or if a standard search is requested with
            parameters other than ('mismatches', 'cover_extension')
    """
    # Read the table of probe counts
    param_names, probe_counts = pool_probes_io.read_table_of_probe_counts(
        args.probe_count_tsv)

    if args.dataset_weights_tsv:
        dataset_weights = pool_probes_io.read_table_of_dataset_weights(
            args.dataset_weights_tsv, probe_counts.keys())
    else:
        dataset_weights = None

    # Check that, if loss coefficients were provided, there are the
    # same number of them as parameters
    if args.loss_coeffs and len(args.loss_coeffs) != len(param_names):
        raise ValueError(("If using --loss-coeffs, the number of "
            "coefficients (%d) must be the same as the number of "
            "parameters provided in the input table (%d)") %
            (len(args.loss_coeffs), len(param_names)))

    if args.use_nd:
        # This does not round parameters after searching over the
        # n-dimensional space
        if args.round_params:
            raise ValueError(("The arguments '--use-nd' and '--round-params' "
                "cannot both be used; this does not round parameters "
                "after searching over a space with n > 2"))

        # Perform a higher dimensional search for optimal values of
        # the parameters
        s_results = param_search.higher_dimensional_search(
            param_names, probe_counts, args.target_probe_count,
            loss_coeffs=args.loss_coeffs,
            dataset_weights=dataset_weights)
        write_type = 'float'
    else:
        # For the standard search, the only parameters must be (in order):
        # 'mismatches' and 'cover_extension'. Verify this.
        if param_names != ('mismatches', 'cover_extension'):
            raise ValueError(("For a standard search, the only parameters "
                "in the input table must be, in order: 'mismatches' and "
                "'cover_extension'. Consider using the '--use-nd' argument "
                "to search over additional parameters."))

        # Perform a standard search for optimal values of mismatches and
        # cover extension
        s_results = param_search.standard_search(
            probe_counts, args.target_probe_count,
            round_params=args.round_params,
            loss_coeffs=args.loss_coeffs,
            dataset_weights=dataset_weights)
        write_type = 'int'

    opt_params, opt_params_count, opt_params_loss = s_results

    # Write a table of the optimal parameter values
    pool_probes_io.write_param_values_across_datasets(param_names, opt_params,
        args.param_vals_tsv, type=write_type)

    # Print the total number of probes and loss
    print("Number of probes: %d" % opt_params_count)
    print("Loss: %f" % opt_params_loss)