Example #1
0
    def pearson_correlation(self, column_one, column_two):
        """
        Calculates the `Pearson correlation coefficient <http://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient>`_
        for :code:`column_one` and :code:`column_two`.

        Returns a number between -1 and 1 with 0 implying no correlation. A
        correlation close to 1 implies a high positive correlation i.e. as x
        increases so does y. A correlation close to -1 implies a high negative
        correlation i.e. as x increases, y decreases.

        Rows in which either value is null are excluded, and every term of
        the formula (sums, sums of squares, sum of products and the count) is
        computed from that same set of complete pairs.

        Note: this implementation is borrowed from the MIT licensed
        `latimes-calculate <https://github.com/datadesk/latimes-calculate/blob/master/calculate/pearson.py>`_.
        Thanks, LAT!

        :param column_one: The name of a column.
        :param column_two: The name of a column.
        :returns: :class:`decimal.Decimal`, or :code:`0` when there are no
            complete pairs or when either column has no variance.
        """
        x_column = self.columns[column_one]
        y_column = self.columns[column_two]

        if x_column.aggregate(agate.HasNulls()):
            agate.warn_null_calculation(self, x_column)

        if y_column.aggregate(agate.HasNulls()):
            agate.warn_null_calculation(self, y_column)

        # Pairwise deletion: keep only the rows where both values are present.
        pairs = [
            (x_val, y_val)
            for x_val, y_val in zip(x_column, y_column)
            if x_val is not None and y_val is not None
        ]

        n = len(pairs)

        # Without any complete pair the coefficient is undefined; report it
        # the same way as the zero-variance case below instead of dividing
        # by zero.
        if n == 0:
            return 0

        # The sums must come from the filtered pairs. A whole-column sum would
        # still include values whose partner is null and would therefore
        # disagree with n and with the squared/product sums.
        sum_x = sum(x_val for x_val, _ in pairs)
        sum_y = sum(y_val for _, y_val in pairs)

        sum_x_sq = sum(pow(x_val, 2) for x_val, _ in pairs)
        sum_y_sq = sum(pow(y_val, 2) for _, y_val in pairs)

        product_sum = sum(x_val * y_val for x_val, y_val in pairs)

        pearson_numerator = product_sum - (sum_x * sum_y / n)
        pearson_denominator = ((sum_x_sq - pow(sum_x, 2) / n) * (sum_y_sq - pow(sum_y, 2) / n)).sqrt()

        # A zero denominator means at least one column is constant.
        if pearson_denominator == 0:
            return 0

        return pearson_numerator / pearson_denominator
Example #2
0
    def run(self, table):
        """
        Calculates the Pearson correlation coefficient between the columns
        named by :code:`self._x_column_name` and :code:`self._y_column_name`.

        Rows in which either value is null are excluded, and every term of
        the formula (sums, sums of squares, sum of products and the count) is
        computed from that same set of complete pairs.

        :param table: The table whose columns are correlated.
        :returns:
            :class:`decimal.Decimal`, or :code:`0` when there are no complete
            pairs or when either column has no variance.
        """
        x_column = table.columns[self._x_column_name]
        y_column = table.columns[self._y_column_name]

        if table.aggregate(agate.HasNulls(self._x_column_name)):
            agate.warn_null_calculation(self, x_column)

        if table.aggregate(agate.HasNulls(self._y_column_name)):
            agate.warn_null_calculation(self, y_column)

        # Pairwise deletion: keep only the rows where both values are present.
        pairs = [
            (x_val, y_val)
            for x_val, y_val in zip(x_column, y_column)
            if x_val is not None and y_val is not None
        ]

        n = len(pairs)

        # Without any complete pair the coefficient is undefined; report it
        # the same way as the zero-variance case below instead of dividing
        # by zero.
        if n == 0:
            return 0

        # The sums must come from the filtered pairs. A whole-column Sum
        # aggregate would still include values whose partner is null and
        # would therefore disagree with n and with the squared/product sums.
        sum_x = sum(x_val for x_val, _ in pairs)
        sum_y = sum(y_val for _, y_val in pairs)

        sum_x_sq = sum(pow(x_val, 2) for x_val, _ in pairs)
        sum_y_sq = sum(pow(y_val, 2) for _, y_val in pairs)

        product_sum = sum(x_val * y_val for x_val, y_val in pairs)

        pearson_numerator = product_sum - (sum_x * sum_y / n)
        pearson_denominator = ((sum_x_sq - pow(sum_x, 2) / n) * (sum_y_sq - pow(sum_y, 2) / n)).sqrt()

        # A zero denominator means at least one column is constant.
        if pearson_denominator == 0:
            return 0

        return pearson_numerator / pearson_denominator
Example #3
0
    def run(self, table):
        """
        Correlates the observed leading-digit distribution of the column
        named by :code:`self._column_name` with the distribution expected
        under Benford's law.

        The leading digit is the first non-zero digit of the value, so
        :code:`0.05` counts as a 5. Nulls and values without a significant
        digit (such as zero) are left out of the distribution.

        :param table: The table containing the column to analyze.
        :returns:
            The result of aggregating :code:`PearsonCorrelation` over the
            observed and expected digit frequencies.
        """
        column = table.columns[self._column_name]

        if table.aggregate(agate.HasNulls(self._column_name)):
            agate.warn_null_calculation(self, column)

        digits = []

        for val in column:
            # Nulls were warned about above; abs(None) would raise.
            if val is None:
                continue

            # Drop any exponent part so its digits are never mistaken for
            # the leading digit of the value (e.g. '0E-7').
            mantissa = str(abs(val)).lower().split('e')[0]
            leading = next((ch for ch in mantissa if ch in '123456789'), None)

            if leading is not None:
                digits.append(int(leading))

        total = len(digits)

        if total:
            percents = [float(digits.count(n)) / total for n in range(1, 10)]
        else:
            # No usable values: an all-zero distribution avoids dividing by
            # zero and leaves the degenerate case to the correlation itself.
            percents = [0.0] * 9

        benfords = [.301, .176, .125, .097, .079, .067, .058, .051, .046]

        # Materialize the pairs so the rows can be sized and re-iterated.
        rows = list(zip(percents, benfords))
        # Kept as instance attributes, as before, in case callers read them.
        self.column_names = ['self', 'benford']
        self.column_types = [agate.Number(), agate.Number()]

        comparison = agate.Table(rows, self.column_names, self.column_types)

        return comparison.aggregate(PearsonCorrelation('self', 'benford'))