def pearson_correlation(self, column_one, column_two):
    """
    Calculates the `Pearson correlation coefficient <http://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient>`_
    for :code:`column_one` and :code:`column_two`.

    Returns a number between -1 and 1 with 0 implying no correlation. A
    correlation close to 1 implies a high positive correlation i.e. as x
    increases so does y. A correlation close to -1 implies a high negative
    correlation i.e. as x increases, y decreases.

    Rows in which either value is null are excluded from every part of the
    calculation (a warning is issued for each column that contains nulls).
    If no complete pairs remain, or if either column has no spread, the
    coefficient is undefined and 0 is returned.

    Column values are expected to be :class:`decimal.Decimal`, because the
    denominator is computed with :meth:`decimal.Decimal.sqrt`.

    Note: this implementation is borrowed from the MIT licensed
    `latimes-calculate <https://github.com/datadesk/latimes-calculate/blob/master/calculate/pearson.py>`_.
    Thanks, LAT!

    :param column_one: The name of a column.
    :param column_two: The name of a column.
    :returns: :class:`decimal.Decimal`.
    """
    from decimal import Decimal

    x_column = self.columns[column_one]
    y_column = self.columns[column_two]

    if x_column.aggregate(agate.HasNulls()):
        agate.warn_null_calculation(self, x_column)

    if y_column.aggregate(agate.HasNulls()):
        agate.warn_null_calculation(self, y_column)

    # Keep only rows where both values are present so that every statistic
    # below is computed over exactly the same observations.
    pairs = [
        (x_val, y_val)
        for x_val, y_val in zip(x_column, y_column)
        if x_val is not None and y_val is not None
    ]

    n = len(pairs)

    if n == 0:
        # Nothing to correlate; avoid dividing by zero below.
        return Decimal(0)

    # Sums are taken from the filtered pairs, not from the whole columns:
    # a whole-column sum would include values whose partner is null and
    # would then disagree with n and the squared sums.
    sum_x = sum(x_val for x_val, _ in pairs)
    sum_y = sum(y_val for _, y_val in pairs)
    sum_x_sq = sum(x_val * x_val for x_val, _ in pairs)
    sum_y_sq = sum(y_val * y_val for _, y_val in pairs)
    product_sum = sum(x_val * y_val for x_val, y_val in pairs)

    pearson_numerator = product_sum - (sum_x * sum_y / n)

    x_spread = sum_x_sq - pow(sum_x, 2) / n
    y_spread = sum_y_sq - pow(sum_y, 2) / n
    spread_product = x_spread * y_spread

    # Mathematically the product is never negative, but rounding in the
    # divisions above can push a zero spread slightly below zero, which
    # would make sqrt() fail. Treat that like the zero-spread case.
    if spread_product <= 0:
        return Decimal(0)

    pearson_denominator = spread_product.sqrt()

    if pearson_denominator == 0:
        return Decimal(0)

    return pearson_numerator / pearson_denominator
def run(self, table):
    """
    Compute the Pearson correlation coefficient between the two columns
    named by :code:`self._x_column_name` and :code:`self._y_column_name`.

    Rows in which either value is null are excluded from every part of the
    calculation (a warning is issued for each column that contains nulls).
    If no complete pairs remain, or if either column has no spread, the
    coefficient is undefined and 0 is returned.

    Column values are expected to be :class:`decimal.Decimal`, because the
    denominator is computed with :meth:`decimal.Decimal.sqrt`.

    :param table: The table whose columns are correlated.
    :returns: :class:`decimal.Decimal`.
    """
    from decimal import Decimal

    x_column = table.columns[self._x_column_name]
    y_column = table.columns[self._y_column_name]

    if table.aggregate(agate.HasNulls(self._x_column_name)):
        agate.warn_null_calculation(self, x_column)

    if table.aggregate(agate.HasNulls(self._y_column_name)):
        agate.warn_null_calculation(self, y_column)

    # Keep only rows where both values are present so that every statistic
    # below is computed over exactly the same observations.
    pairs = [
        (x_val, y_val)
        for x_val, y_val in zip(x_column, y_column)
        if x_val is not None and y_val is not None
    ]

    n = len(pairs)

    if n == 0:
        # Nothing to correlate; avoid dividing by zero below.
        return Decimal(0)

    # Sums are taken from the filtered pairs, not from a whole-column
    # aggregate: that would include values whose partner is null and would
    # then disagree with n and the squared sums.
    sum_x = sum(x_val for x_val, _ in pairs)
    sum_y = sum(y_val for _, y_val in pairs)
    sum_x_sq = sum(x_val * x_val for x_val, _ in pairs)
    sum_y_sq = sum(y_val * y_val for _, y_val in pairs)
    product_sum = sum(x_val * y_val for x_val, y_val in pairs)

    pearson_numerator = product_sum - (sum_x * sum_y / n)

    x_spread = sum_x_sq - pow(sum_x, 2) / n
    y_spread = sum_y_sq - pow(sum_y, 2) / n
    spread_product = x_spread * y_spread

    # Mathematically the product is never negative, but rounding in the
    # divisions above can push a zero spread slightly below zero, which
    # would make sqrt() fail. Treat that like the zero-spread case.
    if spread_product <= 0:
        return Decimal(0)

    pearson_denominator = spread_product.sqrt()

    if pearson_denominator == 0:
        return Decimal(0)

    return pearson_numerator / pearson_denominator
def run(self, table):
    """
    Measure how closely the leading digits of the column named by
    :code:`self._column_name` follow Benford's law.

    The observed frequency of each leading digit 1-9 is compared with the
    expected Benford frequency using :class:`PearsonCorrelation`, and that
    correlation is returned.

    Null values are skipped (a warning is issued if any are present), as
    are values with no non-zero digit such as 0. The leading digit is the
    first non-zero digit of the absolute value, so 0.052 contributes a 5.
    If no usable values remain, all observed frequencies are 0.

    As a side effect, :code:`self.column_names` and
    :code:`self.column_types` are set to describe the intermediate
    two-column comparison table.

    :param table: The table containing the column to test.
    :returns: :class:`decimal.Decimal`.
    """
    column = table.columns[self._column_name]

    if table.aggregate(agate.HasNulls(self._column_name)):
        agate.warn_null_calculation(self, column)

    # counts[d] is the number of values whose leading digit is d;
    # index 0 is never used.
    counts = [0] * 10
    total = 0

    for val in column:
        if val is None:
            continue

        # Drop any exponent ("1E+3", "1e-07") so that its digits are never
        # mistaken for the leading digit of the value itself.
        mantissa = str(abs(val)).upper().partition('E')[0]
        leading = next((ch for ch in mantissa if ch in '123456789'), None)

        if leading is None:
            # Zero, NaN, infinity: no leading significant digit.
            continue

        counts[int(leading)] += 1
        total += 1

    if total:
        percents = [float(counts[n]) / total for n in range(1, 10)]
    else:
        # No usable values; avoid dividing by zero.
        percents = [0.0] * 9

    benfords = [.301, .176, .125, .097, .079, .067, .058, .051, .046]

    # Materialise the rows: zip() is a one-shot iterator on Python 3.
    rows = list(zip(percents, benfords))

    self.column_names = ['self', 'benford']
    self.column_types = [agate.Number(), agate.Number()]

    comparison = agate.Table(rows, self.column_names, self.column_types)

    return comparison.aggregate(PearsonCorrelation('self', 'benford'))