Beispiel #1
0
def covariance(x, y):
    """Return the sample covariance of x and y.

    Data whose first element is not a SecureObject is handed to
    statistics.covariance() when the interpreter's minor version is at least
    10, and otherwise processed by an inline copy of that routine. Secure
    fixed-point and secure integer inputs are processed with runtime.sum()
    and runtime.in_prod().

    Raises StatisticsError if x and y differ in length or contain fewer than
    two data points, and TypeError for any other secure type.
    """
    size = len(x)
    if size != len(y):
        raise statistics.StatisticsError('covariance requires that both inputs '
                                         'have same number of data points')

    if size <= 1:
        raise statistics.StatisticsError('covariance requires at least two data points')

    # The first element's type is taken as representative for all of x.
    elt_type = type(x[0])

    if not issubclass(elt_type, SecureObject):
        if sys.version_info.minor >= 10:
            return statistics.covariance(x, y)

        # Fallback: same computation as statistics.covariance() in Python 3.10.0.
        mean_x = fsum(x) / size
        mean_y = fsum(y) / size
        cross = fsum((a - mean_x) * (b - mean_y) for a, b in zip(x, y))
        return cross / (size - 1)

    if issubclass(elt_type, SecureFixedPoint):
        mean_x = runtime.sum(x) / size
        mean_y = runtime.sum(y) / size
        dev_x = [xi - mean_x for xi in x]
        dev_y = [yi - mean_y for yi in y]
        return runtime.in_prod(dev_x, dev_y) / (size - 1)

    if issubclass(elt_type, SecureInteger):
        # Work with deviations scaled by size so no division is needed until
        # the very end; the scaling is undone by the size**2 factor below.
        total_x = runtime.sum(x)
        total_y = runtime.sum(y)
        scaled_x = [xi * size - total_x for xi in x]
        scaled_y = [yi * size - total_y for yi in y]
        cross = runtime.in_prod(scaled_x, scaled_y)
        denom = size**2 * (size - 1)
        # Adding denom//2 before the floor division rounds to nearest.
        return (cross + denom//2) // denom

    raise TypeError('secure fixed-point or integer type required')
def _correlation(x, y) -> float:
    """Return Pearson's correlation coefficient for two inputs.

    Pearson's correlation coefficient *r* takes values between -1 and +1. It
    measures the strength and direction of the linear relationship, where +1
    means very strong, positive linear relationship, -1 very strong, negative
    linear relationship, and 0 no linear relationship.

    Raises StatisticsError if the inputs differ in length, contain fewer than
    two data points, or if at least one of them is constant.
    """
    n = len(x)
    if len(y) != n:
        raise statistics.StatisticsError(
            "correlation requires that both inputs have same number of data points"
        )
    if n < 2:
        raise statistics.StatisticsError(
            "correlation requires at least two data points")
    xbar = fsum(x) / n
    ybar = fsum(y) / n
    sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
    sxx = fsum((xi - xbar)**2.0 for xi in x)
    syy = fsum((yi - ybar)**2.0 for yi in y)
    try:
        return sxy / sqrt(sxx * syy)
    except ZeroDivisionError:
        # A constant input gives a zero sum of squares. Suppress the internal
        # ZeroDivisionError so callers only see the domain-level error.
        raise statistics.StatisticsError(
            "at least one of the inputs is constant") from None
Beispiel #3
0
def linear_regression(x, y):
    """Return a (simple) linear regression model for x and y.

    The parameters of the model are returned as a named LinearRegression tuple,
    with two fields called "slope" and "intercept", respectively.

    A linear regression model describes the relationship between independent
    variable x and dependent variable y in terms of a linear function:

        y = slope * x + intercept + noise

    Here, slope and intercept are the regression parameters estimated using
    ordinary least squares, and noise represents the variability of the data
    not explained by the linear regression (it is equal to the difference
    between predicted and actual values of the dependent variable).

    Raises StatisticsError if x and y differ in length, contain fewer than two
    data points, or (for non-secure data) if x is constant. Raises TypeError
    for secure types other than secure fixed-point numbers.
    """
    n = len(x)
    if len(y) != n:
        raise statistics.StatisticsError(
            'linear regression requires that both inputs '
            'have same number of data points')

    if n < 2:
        raise statistics.StatisticsError(
            'linear regression requires at least two data points')

    sectype = type(x[0])  # all elts of x assumed of same type
    if not issubclass(sectype, SecureObject):
        if sys.version_info.minor >= 10:
            return statistics.linear_regression(x, y)

        # inline code of statistics.linear_regression() adapted from Python 3.10.0:
        xbar = fsum(x) / n
        ybar = fsum(y) / n
        sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
        sxx = fsum((xi - xbar)**2.0 for xi in x)
        try:
            slope = sxy / sxx  # equivalent to:  covariance(x, y) / variance(x)
        except ZeroDivisionError:
            raise statistics.StatisticsError('x is constant') from None

        intercept = ybar - slope * xbar
        return LinearRegression(slope=slope, intercept=intercept)

    if issubclass(sectype, SecureFixedPoint):
        xbar = runtime.sum(x) / n
        ybar = runtime.sum(y) / n
        xxbar = [xi - xbar for xi in x]
        yybar = [yi - ybar for yi in y]
        sxy = runtime.in_prod(xxbar, yybar)
        sxx = runtime.in_prod(xxbar, xxbar)
        # NOTE: no constant-x check on this path; the division is performed
        # on secure values as-is.
        slope = sxy / sxx
        intercept = ybar - slope * xbar
        return LinearRegression(slope=slope, intercept=intercept)

    # TODO: implement for secure integers as well
    raise TypeError('secure fixed-point type required')
Beispiel #4
0
def mode(data):
    """Return the mode, the most common data point from discrete or nominal data.

    When several values share the highest frequency, the one encountered
    first in data is returned.

    StatisticsError is raised for empty data.

    For secure data, the bit length of the sample range max(data) - min(data)
    is revealed to speed up the computation, provided this range is not too
    small.
    """
    # Work on a private copy: materialize iterators, slice-copy sequences.
    values = list(data) if iter(data) is data else data[:]
    if len(values) == 0:
        raise statistics.StatisticsError(
            'mode requires at least one data point')

    if not isinstance(values[0], sectypes.SecureObject):
        # NB: raises StatisticsError in Python < 3.8 if values is multimodal
        return statistics.mode(values)

    return _mode(values, PRIV=runtime.options.sec_param // 6)
Beispiel #5
0
def mean(data):
    """Return the sample mean (average) of data which can be a sequence or an iterable.

    For secure integers or secure fixed-point numbers, the returned mean is
    of the same secure type, rounded to the nearest number.

    StatisticsError is raised for empty data; TypeError is raised for secure
    finite field elements.
    """
    # Iterators are materialized; sequences are used as given.
    values = list(data) if iter(data) is data else data
    count = len(values)
    if count == 0:
        raise statistics.StatisticsError(
            'mean requires at least one data point')

    # The first element's type is taken as representative for all values.
    elt_type = type(values[0])
    if issubclass(elt_type, sectypes.SecureFiniteField):
        raise TypeError('secure fixed-point or integer type required')

    if issubclass(elt_type, sectypes.SecureInteger):
        total = runtime.sum(values)
        # Adding count//2 before floor division rounds to the nearest integer.
        return (total + count // 2) // count

    if issubclass(elt_type, sectypes.SecureFixedPoint):
        total = runtime.sum(values)
        # Split 1/count into (2**shift / count) * 2**-shift,
        # where 1/2 < 2**shift / count <= 1.
        shift = count.bit_length() - 1
        return total * (2**shift / count) * 2**-shift

    return statistics.mean(values)
Beispiel #6
0
def _med(data, med=None):
    """Return the median of data; med selects 'low', 'high' or (default) the average.

    Non-secure data is delegated to statistics.median(), regardless of med.
    For secure data the required order statistics are obtained via
    _quickselect(); with an even number of points and no med given, the two
    middle values are averaged (floor division for secure integers).

    Raises StatisticsError for empty data and TypeError for secure finite
    field elements.
    """
    # Work on a private copy: materialize iterators, slice-copy sequences.
    values = list(data) if iter(data) is data else data[:]
    count = len(values)
    if count == 0:
        raise statistics.StatisticsError(
            'median requires at least one data point')

    # The first element's type is taken as representative for all values.
    elt_type = type(values[0])
    if issubclass(elt_type, sectypes.SecureFiniteField):
        raise TypeError('secure fixed-point or integer type required')

    if not issubclass(elt_type, sectypes.SecureObject):
        return statistics.median(values)

    if count % 2 == 1:
        # Odd count: a single middle element.
        return _quickselect(values, (count - 1) / 2)

    lower_rank = (count - 2) / 2
    upper_rank = count / 2
    if med == 'low':
        return _quickselect(values, lower_rank)

    if med == 'high':
        return _quickselect(values, upper_rank)

    # Even count, no preference: average the two middle values.
    middle_sum = _quickselect(values, lower_rank) + _quickselect(values, upper_rank)
    if issubclass(elt_type, sectypes.SecureInteger):
        return middle_sum // 2

    return middle_sum / 2
Beispiel #7
0
def _std(data, m, correction):
    """Return the standard deviation of data around m.

    correction is 1 for the sample standard deviation (stdev) and 0 for the
    population standard deviation (pstdev). Secure integers and secure
    fixed-point numbers take the square root of _var() via _isqrt() and
    _fsqrt(), respectively; other data is delegated to the statistics module.

    Raises StatisticsError when there are fewer than 1 + correction data
    points and TypeError for secure finite field elements.
    """
    # Iterators are materialized; sequences are used as given.
    values = list(data) if iter(data) is data else data
    count = len(values)
    if count < 1 + correction:
        message = ('stdev requires at least two data points' if correction
                   else 'pstdev requires at least one data point')
        raise statistics.StatisticsError(message)

    # The first element's type is taken as representative for all values.
    elt_type = type(values[0])
    if issubclass(elt_type, sectypes.SecureFiniteField):
        raise TypeError('secure fixed-point or integer type required')

    if issubclass(elt_type, sectypes.SecureInteger):
        return _isqrt(_var(values, m, correction))

    if issubclass(elt_type, sectypes.SecureFixedPoint):
        return _fsqrt(_var(values, m, correction))

    plain_std = statistics.stdev if correction else statistics.pstdev
    return plain_std(values, m)
Beispiel #8
0
def correlation(x, y):
    """Return Pearson's correlation coefficient for x and y.

    Pearson's correlation coefficient takes values between -1 and +1.
    It measures the strength and direction of the linear relationship
    between x and y, where +1 means very strong, positive linear relationship,
    -1 very strong, negative linear relationship, and 0 no linear relationship.

    Raises StatisticsError if x and y differ in length, contain fewer than two
    data points, or (for non-secure data) if at least one input is constant.
    Raises TypeError for secure types other than secure fixed-point numbers.
    """
    n = len(x)
    if len(y) != n:
        raise statistics.StatisticsError(
            'correlation requires that both inputs '
            'have same number of data points')

    if n < 2:
        raise statistics.StatisticsError(
            'correlation requires at least two data points')

    sectype = type(x[0])  # all elts of x assumed of same type
    if not issubclass(sectype, SecureObject):
        if sys.version_info.minor >= 10:
            return statistics.correlation(x, y)

        # inline code of statistics.correlation() copied from Python 3.10.0:
        xbar = fsum(x) / n
        ybar = fsum(y) / n
        sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
        sxx = fsum((xi - xbar)**2.0 for xi in x)
        syy = fsum((yi - ybar)**2.0 for yi in y)
        try:
            return sxy / sqrt(sxx * syy)

        except ZeroDivisionError:
            raise statistics.StatisticsError(
                'at least one of the inputs is constant') from None

    if issubclass(sectype, SecureFixedPoint):
        xbar = runtime.sum(x) / n
        ybar = runtime.sum(y) / n
        xxbar = [xi - xbar for xi in x]
        yybar = [yi - ybar for yi in y]
        sxy = runtime.in_prod(xxbar, yybar)
        sxx = runtime.in_prod(xxbar, xxbar)
        syy = runtime.in_prod(yybar, yybar)
        # NOTE: no constant-input check on this path; the division is
        # performed on secure values as-is.
        return sxy / (_fsqrt(sxx) * _fsqrt(syy))

    raise TypeError('secure fixed-point type required')
Beispiel #9
0
def _presorted_median(data):
    """Return the median of data, which is indexed as if already sorted.

    For an odd number of points the middle element is returned unchanged;
    for an even number the two middle elements are averaged with true
    division. Raises StatisticsError for empty data.
    """
    size = len(data)
    if not size:
        raise statistics.StatisticsError('no median for empty data')
    mid, is_odd = divmod(size, 2)
    if is_odd:
        return data[mid]
    return (data[mid - 1] + data[mid]) / 2
Beispiel #10
0
def singlemode(data):
    """Return the single most common value in data.

    Uses statistics.multimode() where available (Python 3.8+) and falls back
    to statistics.mode() otherwise.

    Raises StatisticsError if data is empty or if there is no unique mode.
    """
    try:
        # New in Python 3.8
        modes = statistics.multimode(data)
    except AttributeError:
        return statistics.mode(data)

    if not modes:
        # multimode() returns [] for empty data; report that as a
        # StatisticsError (as statistics.mode() does) rather than letting
        # modes[0] fail with IndexError.
        raise statistics.StatisticsError('no mode for empty data')
    if len(modes) > 1:
        raise statistics.StatisticsError('no unique mode')
    return modes[0]
def _quantiles(data, *, n=4, method="exclusive") -> List:
    """Split *data* into *n* continuous intervals of equal probability.

    The result is a list of (n - 1) cut points separating the intervals.
    Use *n* = 4 for quartiles (the default), *n* = 10 for deciles, and
    *n* = 100 for percentiles, which gives the 99 cut points that separate
    *data* into 100 equal sized groups.

    *data* may be any iterable of samples; it is sorted internally. Cut
    points are linearly interpolated between neighbouring data points.

    With *method* "inclusive", *data* is treated as population data: the
    minimum is the 0th percentile and the maximum the 100th percentile.

    Raises StatisticsError if n < 1 or fewer than two data points are given,
    and ValueError for an unknown *method*.
    """
    if n < 1:
        raise statistics.StatisticsError("n must be at least 1")
    ordered = sorted(data)
    size = len(ordered)
    if size < 2:
        raise statistics.StatisticsError("must have at least two data points")

    if method == "inclusive":
        span = size - 1
        return [
            (ordered[lo] * (n - frac) + ordered[lo + 1] * frac) / n
            for lo, frac in (divmod(i * span, n) for i in range(1, n))
        ]

    if method == "exclusive":
        span = size + 1
        cuts = []
        for i in range(1, n):
            # Rescale i to span/n, then clamp the upper index to 1 .. size-1.
            hi = min(max(i * span // n, 1), size - 1)
            frac = i * span - hi * n  # exact integer math
            cuts.append((ordered[hi - 1] * (n - frac) + ordered[hi] * frac) / n)
        return cuts

    raise ValueError(f"Unknown method: {method!r}")
Beispiel #12
0
    def get_median(self, full_tree: bool = True, sub_tree=None) -> float:
        """
        Compute the median of the values stored in a tree.

        Args:
            full_tree (bool): when True, self.root is used and sub_tree is ignored;
                when False, the tree given by sub_tree is used
            sub_tree (Node|None): tree to use when full_tree is False

        Returns:
            Median of the values collected by self._get_subtree_values()

        Raises:
            statistics.StatisticsError: when statistics.median() rejects the collected
            values (i.e. the tree yields no values)
        """
        target = self.root if full_tree else sub_tree
        try:
            return statistics.median(self._get_subtree_values(target))
        except statistics.StatisticsError:
            # Replace the generic library message with a tree-specific one.
            raise statistics.StatisticsError(
                'Cannot calculate median from an empty Tree')
Beispiel #13
0
def _var(data, m, correction):
    """Return the variance of data around m (the mean is used if m is None).

    correction is 1 for the sample variance and 0 for the population
    variance. Secure integers and secure fixed-point numbers are handled
    with runtime operations; other data is delegated to the statistics module.

    Raises StatisticsError when there are fewer than 1 + correction data
    points and TypeError for secure finite field elements.
    """
    # Iterators are materialized; sequences are used as given.
    values = list(data) if iter(data) is data else data
    count = len(values)
    if count < 1 + correction:
        message = ('variance requires at least two data points' if correction
                   else 'pvariance requires at least one data point')
        raise statistics.StatisticsError(message)

    # The first element's type is taken as representative for all values.
    elt_type = type(values[0])
    if issubclass(elt_type, sectypes.SecureFiniteField):
        raise TypeError('secure fixed-point or integer type required')

    if issubclass(elt_type, sectypes.SecureInteger):
        if m is None:
            # Use deviations scaled by count to stay in the integers; the
            # count**2 factor in the divisor undoes the scaling.
            total = runtime.sum(values)
            # TODO: runtime.scalar_mul(n,x) for public (int) n
            deviations = [v * count - total for v in values]
            divisor = count**2 * (count - correction)
        else:
            # TODO: runtime.vector_sub(x,y) for scalar y
            deviations = runtime.vector_sub(values, [m] * count)
            divisor = count - correction
        # Adding divisor//2 before floor division rounds to nearest.
        return (runtime.in_prod(deviations, deviations) + divisor // 2) // divisor

    if issubclass(elt_type, sectypes.SecureFixedPoint):
        center = mean(values) if m is None else m
        deviations = runtime.vector_sub(values, [center] * count)
        return runtime.in_prod(deviations, deviations) / (count - correction)

    plain_var = statistics.variance if correction else statistics.pvariance
    return plain_var(values, m)
def doCalc(G, dyna, do, counter, results):
    """Compute PageRank statistics for G and write them to the results file.

    The PageRank values of G (alpha=0.85) are compared against the uniform
    value 1/number_of_nodes(G), and a set of descriptive statistics is
    computed over them. If the current results file
    ./results/testResults<counter>.txt has reached 500000 bytes, it is closed
    and a new file with the next counter value is opened for writing.

    Args:
        G: graph passed to nx.pagerank() and number_of_nodes().
        dyna, do: strings included in the report header line.
        counter: index of the current results file.
        results: open, writable file object for the current results file.

    Returns:
        The (possibly incremented) counter and the (possibly new) file object.
    """
    ranks = nx.pagerank(G, alpha=0.85)
    uniform = float(1) / float(number_of_nodes(G))

    at_uniform = 0
    below_uniform = 0
    above_uniform = 0
    lowest = 1
    highest = 0
    total = 0
    for rank in ranks.values():
        total = total + rank
        if rank < uniform:
            below_uniform += 1
            if lowest > rank:
                lowest = rank
        elif rank == uniform:
            at_uniform += 1
        else:
            above_uniform += 1
            if highest < rank:
                highest = rank

    # Count how many nodes attain the minimum; nodes equal to the minimum are
    # never counted towards the maximum.
    items = list(ranks.values())
    lowest_hits = 0
    highest_hits = 0
    for rank in items:
        if rank == lowest:
            lowest_hits += 1
        elif rank == highest:
            highest_hits += 1

    # All report values are computed before any file is touched, so a failing
    # statistics call leaves the results file unchanged.
    report = [
        ("average: ", str(uniform)),
        ("total: ", str(total)),
        ("isavg: ", str(at_uniform)),
        ("underavg: ", str(below_uniform)),
        ("aboveavg: ", str(above_uniform)),
        ("minimum: ", str(lowest)),
        ("maximum: ", str(highest)),
        ("oneminimum: ", str(lowest_hits)),
        ("onemaximum: ", str(highest_hits)),
        ("median: ", str(statistics.median(items))),
        ("low median: ", str(statistics.median_low(items))),
        ("high median: ", str(statistics.median_high(items))),
        ("grouped median: ", str(statistics.median_grouped(items))),
        ("mode: ", str(statistics.mode(items))),
        ("pstdev: ", str(statistics.pstdev(items))),
        ("pvariance: ", str(statistics.pvariance(items))),
        ("stdev: ", str(statistics.stdev(items))),
        ("variance: ", str(statistics.variance(items))),
        # too much data, only if actually test
        ("error: ", str(statistics.StatisticsError(items))),
    ]

    def result_path(index):
        return os.getcwd() + "/results/testResults" + str(index) + ".txt"

    # Roll over to a fresh results file once the current one is large enough.
    if os.stat(result_path(counter)).st_size >= 500000:
        results.close()
        counter += 1
        results = open(result_path(counter), 'w')
        print(counter)

    results.write('\n')
    results.write('testdata without ' + dyna + ' and ' + do + '\n')
    results.write(json.dumps(ranks) + '\n')
    for label, text in report:
        results.write(label + text + '\n')
    print("succes writing to test " + str(counter) + " result")
    return counter, results
Beispiel #15
0
def quantiles(data, *, n=4, method='exclusive'):
    """Divide data into n continuous intervals with equal probability.

    Returns a list of n-1 cut points separating the intervals.

    Use n=4 for quartiles (the default), n=10 for deciles, and n=100 for
    percentiles, which gives the 99 cut points that separate data into
    100 equal sized groups.

    The data can be any iterable containing samples. Cut points are
    linearly interpolated between data points.

    With method 'inclusive', data is treated as population data: the minimum
    value is the 0th percentile (lowest quantile) and the maximum value is
    the 100th percentile (highest quantile).

    Raises StatisticsError if n < 1 or fewer than two data points are given,
    TypeError for secure types other than fixed-point or integer, and
    ValueError for an unknown method.
    """
    if n < 1:
        raise statistics.StatisticsError('n must be at least 1')

    # Iterators are materialized; sequences are used as given.
    x = list(data) if iter(data) is data else data
    count = len(x)
    if count < 2:
        raise statistics.StatisticsError('must have at least two data points')

    # The first element's type is taken as representative for all of x.
    elt_type = type(x[0])
    if not issubclass(elt_type, SecureObject):
        return statistics.quantiles(x, n=n, method=method)

    if issubclass(elt_type, SecureFixedPoint):
        def div_n(a):
            return a / n
    elif issubclass(elt_type, SecureInteger):
        def div_n(a):
            # Adding n//2 before floor division rounds to nearest.
            return (a + n//2) // n
    else:
        raise TypeError('secure fixed-point or integer type required')

    if method == 'inclusive':
        span = count - 1
        positions = [divmod(i * span, n) for i in range(1, n)]

        # Collect the ranks of the order statistics that are actually needed
        # (a dict is used as an insertion-ordered set).
        needed = {}
        for j, delta in positions:
            needed[j] = None
            if delta:
                needed[j+1] = None
        selected = dict(zip(needed, _quickselect(x, list(needed))))

        # Compute the n-1 cut points for the n quantiles.
        cuts = []
        for j, delta in positions:
            cut = selected[j]
            if delta:
                cut += div_n((selected[j+1] - selected[j]) * delta)
            cuts.append(cut)
        return cuts

    if method == 'exclusive':
        span = count + 1
        positions = []
        for i in range(1, n):
            j = min(max(i * span // n, 1), count - 1)  # clamp to 1 .. count-1
            positions.append((j, i*span - j*n))

        # Collect the ranks of the order statistics that are actually needed
        # (a dict is used as an insertion-ordered set).
        needed = {}
        for j, delta in positions:
            if n - delta:
                needed[j-1] = None
            if delta:
                needed[j] = None
        selected = dict(zip(needed, _quickselect(x, list(needed))))

        # Compute the n-1 cut points for the n quantiles.
        cuts = []
        for j, delta in positions:
            if delta == 0:
                cut = selected[j-1]
            elif delta == n:
                cut = selected[j]
            else:  # NB: possibly delta<0 or delta>n
                cut = selected[j-1] + div_n((selected[j] - selected[j-1]) * delta)
            cuts.append(cut)
        return cuts

    raise ValueError(f'Unknown method: {method!r}')