def find_reference_dispersion(data, k, number_of_bootstraps=10):
    """
    Finds the reference dispersion (and confidence) for the data supplied.

    For every bootstrap run a reference point set is drawn with
    ``generate_bounding_box_uniform_points(data)``, clustered through
    ``default_clustering`` and the natural log of the resulting dispersion
    is fed into an ``IncrementalStatistic``. Runs whose dispersion is
    exactly 0 are skipped with a warning, since log(0) is undefined; such
    runs contribute neither to the mean nor to the spread.

    data -- the data set the reference point sets are generated from.
    k -- forwarded to ``default_clustering`` (presumably the number of
        clusters; confirm against that helper).
    number_of_bootstraps -- how many reference sets to generate. Must be
        at least 1, because the spread computation divides by it.

    Returns a tuple ``(mean, spread)`` where ``mean`` is the mean of the
    log dispersions and ``spread`` is their standard deviation multiplied
    by ``sqrt(1 + 1 / number_of_bootstraps)``, i.e. the simulation-error
    term ``s_k`` of the gap statistic.
    """
    incremental_statistic = IncrementalStatistic()
    logging.info(
        'Finding {} reference dispersions'.format(number_of_bootstraps))

    for run_number in range(number_of_bootstraps):
        start = datetime.datetime.utcnow()
        logging.info('At iteration {}'.format(run_number))

        # NOTE: generate_principal_components_box_uniform_points(data) was
        # previously considered as an alternative reference sampler; the
        # plain bounding-box sampler is the one in use.
        uniform_points = generate_bounding_box_uniform_points(data)

        # Only the dispersion is needed; the other two return values are
        # ignored. NOTE(review): the meaning of the trailing arguments
        # (1, 500) is defined by default_clustering -- confirm there.
        dispersion, _, _ = default_clustering(uniform_points, k, 1, 500)
        if 0 == dispersion:
            logging.warning(
                '[Reference Dispersion] Cannot take the log of 0 for run '
                'number = {}.'.format(run_number))
            # Skipped runs are not timed and add nothing to the statistic.
            continue

        incremental_statistic.add_value(np.log(dispersion))
        end = datetime.datetime.utcnow()
        logging.info('Time for last reference set: {}'.format((end - start)))

    # Gap-statistic simulation error: s_k = sd_k * sqrt(1 + 1/B). The
    # factor widens the standard deviation to account for the finite number
    # of reference sets, so it must multiply (not divide). ``1.0`` forces
    # true division even where ``/`` on two ints would truncate to 0.
    simulation_error_factor = np.sqrt(1 + 1.0 / number_of_bootstraps)
    stddev_dispersions = (
        incremental_statistic.get_standard_deviation()
        * simulation_error_factor)
    return incremental_statistic.get_mean(), stddev_dispersions