Example #1
0
def outlierDetection(samples_x, samples_y_aggregation, z_threshold=2.33):
    '''
    Flag samples whose observed value disagrees with a leave-one-out prediction.

    For every index, a diagnostic regression model is fitted on all the other
    samples (gp_create_model.createModel), the held-out sample is predicted
    (gp_prediction.predict, which yields a mean and a sigma), and the sample is
    reported when its observed value lies more than z_threshold * sigma away
    from the predicted mean.
    NOTE(review): the gp_* helper names suggest a Gaussian process regressor;
    confirm against those modules.

    Parameters
    ----------
    samples_x :
        Sample inputs. Must support slicing and "+" concatenation (e.g. a
        list), because the held-out sample is removed by joining the slices
        before and after it.
    samples_y_aggregation :
        Observed value for each entry of samples_x, in the same order, with
        the same slicing / concatenation requirements.
    z_threshold :
        Number of predicted sigmas the observation may deviate from the
        predicted mean before it is reported. Defaults to 2.33, the z-score
        for a 98% confidence level.

    Returns
    -------
    A list with one dict per outlier, holding the keys "samples_idx",
    "expected_mu", "expected_sigma" and "difference" (how far the deviation
    exceeds the allowed tolerance), or None when no outlier was found
    (including when samples_x is empty).
    '''
    outliers = []
    for samples_idx, sample_x in enumerate(samples_x):
        # Fit the diagnostic model on every sample except the one under test.
        diagnostic_regressor_gp = gp_create_model.createModel(
            samples_x[:samples_idx] + samples_x[samples_idx + 1:],
            samples_y_aggregation[:samples_idx]
            + samples_y_aggregation[samples_idx + 1:])
        mu, sigma = gp_prediction.predict(sample_x,
                                          diagnostic_regressor_gp['model'])

        # Compute both sides of the comparison once; they are reused for the
        # reported "difference".
        deviation = abs(samples_y_aggregation[samples_idx] - mu)
        tolerance = z_threshold * sigma
        if deviation > tolerance:
            outliers.append({
                "samples_idx": samples_idx,
                "expected_mu": mu,
                "expected_sigma": sigma,
                "difference": deviation - tolerance,
            })

    # Callers receive None, not an empty list, when nothing was flagged.
    return outliers or None
Example #2
0
def _outlierDetection_threaded(inputs):
    '''
    Check a single sample against a model fitted on all the other samples.

    inputs is a three-element sequence
    [samples_idx, samples_x, samples_y_aggregation]: the index of the sample
    to evaluate, the sample inputs and the observed values. Both sample
    containers must support slicing and "+" concatenation (e.g. lists).

    A progress line is written to stderr on every call.

    Returns a dict with the keys "samples_idx", "expected_mu",
    "expected_sigma" and "difference" when the observed value deviates from
    the predicted mean by more than 2.33 predicted sigmas, otherwise None.
    '''
    samples_idx, samples_x, samples_y_aggregation = inputs

    progress_line = "[%s] DEBUG: Evaluating %dth of %d samples\n" % (
        os.path.basename(__file__), samples_idx + 1, len(samples_x))
    sys.stderr.write(progress_line)

    # Hold out the sample under evaluation and fit the diagnostic model on
    # everything that remains.
    remaining_x = samples_x[:samples_idx] + samples_x[samples_idx + 1:]
    remaining_y = (samples_y_aggregation[:samples_idx]
                   + samples_y_aggregation[samples_idx + 1:])
    held_out_regressor = gp_create_model.createModel(remaining_x, remaining_y)
    mu, sigma = gp_prediction.predict(samples_x[samples_idx],
                                      held_out_regressor['model'])

    # 2.33 is the z-score for 98% confidence level
    deviation = abs(samples_y_aggregation[samples_idx] - mu)
    tolerance = 2.33 * sigma
    if deviation > tolerance:
        return {
            "samples_idx": samples_idx,
            "expected_mu": mu,
            "expected_sigma": sigma,
            "difference": deviation - tolerance,
        }
    return None