def Compare(a, b):
  """Decide whether the samples from changes a and b differ significantly.

  Returns None when either change is missing or has no sample values,
  compare.PENDING while either change still has work in flight, and
  otherwise the verdict produced by compare.Compare.
  """
  if a is None or b is None:
    return None

  if 'pending' in status_by_change[a] or 'pending' in status_by_change[b]:
    return compare.PENDING

  # Flatten each change's per-attempt result lists into a single sample set.
  a_values = tuple(itertools.chain.from_iterable(results_by_change[a]))
  b_values = tuple(itertools.chain.from_iterable(results_by_change[b]))
  if not a_values or not b_values:
    return None

  # NOTE: Scale the configured comparison-magnitude threshold by the larger
  # inter-quartile range (a dispersion measure: 75th percentile minus 25th
  # percentile). This makes the tolerance track the noise inherent in the
  # measurements -- noisier data must show a larger difference before we call
  # it statistically significant. The 0.001 floor guards against dividing by
  # a (near-)zero IQR.
  widest_iqr = max(math_utils.Iqr(a_values), math_utils.Iqr(b_values), 0.001)
  magnitude = task.payload.get('comparison_magnitude', 1.0) / widest_iqr

  sample_count = (len(a_values) + len(b_values)) // 2
  outcome = compare.Compare(a_values, b_values, sample_count, 'performance',
                            magnitude)
  return outcome.result
def ClusterAndCompare(sequence, partition_point):
  """Returns the comparison result and the clusters at the partition point."""
  # Split the sequence at the partition point and test whether the two
  # resulting clusters look statistically different.
  cluster_a, cluster_b = Cluster(sequence, partition_point)

  # With more than two samples on each side, scale the tolerance by the mean
  # of the two inter-quartile ranges; otherwise fall back to a magnitude of 1.
  magnitude = (
      float(math_utils.Iqr(cluster_a) + math_utils.Iqr(cluster_b)) / 2
      if min(len(cluster_a), len(cluster_b)) > 2 else 1)

  sample_count = (len(cluster_a) + len(cluster_b)) // 2
  comparison = pinpoint_compare.Compare(cluster_a, cluster_b, sample_count,
                                        'performance', magnitude)
  return comparison, cluster_a, cluster_b
def testIqr(self):
  # The inter-quartile range of the values 8 down to 1 (i.e. of 1..8,
  # order-independent) is 75th percentile minus 25th percentile = 4.
  expected = 4
  actual = math_utils.Iqr(xrange(8, 0, -1))
  self.assertEqual(expected, actual)
def _Compare(self, change_a, change_b):
  """Compare the results of two Changes in this Job.

  Aggregate the exceptions and result_values across every Quest for both
  Changes. Then, compare all the results for each Quest. If any of them are
  different, return DIFFERENT. Otherwise, if any of them are inconclusive,
  return UNKNOWN. Otherwise, they are the SAME.

  Arguments:
    change_a: The first Change whose results to compare.
    change_b: The second Change whose results to compare.

  Returns:
    PENDING: If either Change has an incomplete Attempt.
    DIFFERENT: If the two Changes (very likely) have different results.
    SAME: If the two Changes (probably) have the same result.
    UNKNOWN: If we'd like more data to make a decision.
  """
  attempts_a = self._attempts[change_a]
  attempts_b = self._attempts[change_b]

  # Don't compare anything until every attempt for both changes has finished.
  if any(not attempt.completed for attempt in attempts_a + attempts_b):
    return compare.PENDING

  attempt_count = (len(attempts_a) + len(attempts_b)) // 2

  executions_by_quest_a = _ExecutionsPerQuest(attempts_a)
  executions_by_quest_b = _ExecutionsPerQuest(attempts_b)

  any_unknowns = False
  for quest in self._quests:
    executions_a = executions_by_quest_a[quest]
    executions_b = executions_by_quest_b[quest]

    # Compare exceptions (as a boolean failed/succeeded sample per execution).
    exceptions_a = tuple(
        bool(execution.exception) for execution in executions_a)
    exceptions_b = tuple(
        bool(execution.exception) for execution in executions_b)
    if exceptions_a and exceptions_b:
      if self._comparison_mode == FUNCTIONAL:
        # getattr covers both a missing attribute and a falsy magnitude.
        if getattr(self, '_comparison_magnitude', None):
          comparison_magnitude = self._comparison_magnitude
        else:
          comparison_magnitude = 0.5
      else:
        comparison_magnitude = 1.0
      comparison, p_value, low_threshold, high_threshold = compare.Compare(
          exceptions_a, exceptions_b, attempt_count, FUNCTIONAL,
          comparison_magnitude)
      logging.debug('p-value = %.4f (low = %.4f, high = %.4f)', p_value,
                    low_threshold, high_threshold)
      if comparison == compare.DIFFERENT:
        return compare.DIFFERENT
      elif comparison == compare.UNKNOWN:
        any_unknowns = True

    # Compare result values by consolidating all measurements by change, and
    # treating those as a single sample set for comparison.
    def AllValues(execution):
      # Flattens result_values across executions, skipping executions whose
      # result_values are empty or None.
      for e in execution:
        if not e.result_values:
          continue
        for v in e.result_values:
          yield v

    all_a_values = tuple(AllValues(executions_a))
    all_b_values = tuple(AllValues(executions_b))
    if all_a_values and all_b_values:
      if getattr(self, '_comparison_magnitude', None):
        # Scale the configured magnitude by the larger inter-quartile range so
        # that noisier measurements need a bigger difference to be considered
        # significant; the 0.001 floor avoids dividing by a (near-)zero IQR.
        max_iqr = max(
            max(math_utils.Iqr(all_a_values), math_utils.Iqr(all_b_values)),
            0.001)
        comparison_magnitude = abs(self._comparison_magnitude / max_iqr)
      else:
        comparison_magnitude = 1.0
      sample_count = (len(all_a_values) + len(all_b_values)) // 2
      comparison, p_value, low_threshold, high_threshold = compare.Compare(
          all_a_values, all_b_values, sample_count, PERFORMANCE,
          comparison_magnitude)
      logging.debug('p-value = %.4f (low = %.4f, high = %.4f)', p_value,
                    low_threshold, high_threshold)
      if comparison == compare.DIFFERENT:
        return compare.DIFFERENT
      elif comparison == compare.UNKNOWN:
        any_unknowns = True

  if any_unknowns:
    return compare.UNKNOWN

  return compare.SAME
def _Compare(self, change_a, change_b):
  """Compare the results of two Changes in this Job.

  Aggregate the exceptions and result_values across every Quest for both
  Changes. Then, compare all the results for each Quest. If any of them are
  different, return DIFFERENT. Otherwise, if any of them are inconclusive,
  return UNKNOWN. Otherwise, they are the SAME.

  Arguments:
    change_a: The first Change whose results to compare.
    change_b: The second Change whose results to compare.

  Returns:
    PENDING: If either Change has an incomplete Attempt.
    DIFFERENT: If the two Changes (very likely) have different results.
    SAME: If the two Changes (probably) have the same result.
    UNKNOWN: If we'd like more data to make a decision.
  """
  attempts_a = self._attempts[change_a]
  attempts_b = self._attempts[change_b]

  # Don't compare anything until every attempt for both changes has finished.
  if any(not attempt.completed for attempt in attempts_a + attempts_b):
    return compare.PENDING

  # Floor division keeps attempt_count an int under Python 3 as well (plain
  # '/' would produce a float there), matching the sibling implementation.
  attempt_count = (len(attempts_a) + len(attempts_b)) // 2

  executions_by_quest_a = _ExecutionsPerQuest(attempts_a)
  executions_by_quest_b = _ExecutionsPerQuest(attempts_b)

  any_unknowns = False
  for quest in self._quests:
    executions_a = executions_by_quest_a[quest]
    executions_b = executions_by_quest_b[quest]

    # Compare exceptions (as a boolean failed/succeeded sample per execution).
    values_a = tuple(
        bool(execution.exception) for execution in executions_a)
    values_b = tuple(
        bool(execution.exception) for execution in executions_b)
    if values_a and values_b:
      if self._comparison_mode == FUNCTIONAL:
        # getattr covers both a missing attribute and a falsy magnitude.
        if getattr(self, '_comparison_magnitude', None):
          comparison_magnitude = self._comparison_magnitude
        else:
          comparison_magnitude = 0.5
      else:
        comparison_magnitude = 1.0
      comparison = compare.Compare(values_a, values_b, attempt_count,
                                   FUNCTIONAL, comparison_magnitude)
      if comparison == compare.DIFFERENT:
        return compare.DIFFERENT
      elif comparison == compare.UNKNOWN:
        any_unknowns = True

    # Compare result values, one mean value per execution.
    values_a = tuple(
        _Mean(execution.result_values) for execution in executions_a
        if execution.result_values)
    values_b = tuple(
        _Mean(execution.result_values) for execution in executions_b
        if execution.result_values)
    if values_a and values_b:
      if getattr(self, '_comparison_magnitude', None):
        # Scale the configured magnitude by the larger inter-quartile range so
        # that noisier measurements need a bigger difference to be considered
        # significant.
        max_iqr = max(math_utils.Iqr(values_a), math_utils.Iqr(values_b))
        if max_iqr:
          comparison_magnitude = abs(self._comparison_magnitude / max_iqr)
        else:
          comparison_magnitude = 1000  # Something very large.
      else:
        comparison_magnitude = 1.0
      comparison = compare.Compare(values_a, values_b, attempt_count,
                                   PERFORMANCE, comparison_magnitude)
      if comparison == compare.DIFFERENT:
        return compare.DIFFERENT
      elif comparison == compare.UNKNOWN:
        any_unknowns = True

  if any_unknowns:
    return compare.UNKNOWN

  return compare.SAME