Example #1
 def test_pearson(self):
     students = [
         dict(sat=1200, gpa=3.6, drinks_per_day=0.3),
         dict(sat=1400, gpa=3.9, drinks_per_day=0.1),
         dict(sat=1100, gpa=3.0, drinks_per_day=0.5),
         dict(sat=800, gpa=2.5, drinks_per_day=2.0),
     ]
     self.assertEqual(
         calculate.pearson(
             [i.get("sat") for i in students],
             [i.get("gpa") for i in students],
         ), 0.9714441330841945)
     self.assertEqual(
         calculate.pearson(
             [i.get("sat") for i in students],
             [i.get("drinks_per_day") for i in students],
         ), -0.9435297685685435)
     self.assertRaises(ValueError, calculate.pearson, [1], [1, 2, 3])
Example #2
 def test_pearson(self):
     students = [
         dict(sat=1200, gpa=3.6, drinks_per_day=0.3),
         dict(sat=1400, gpa=3.9, drinks_per_day=0.1),
         dict(sat=1100, gpa=3.0, drinks_per_day=0.5),
         dict(sat=800, gpa=2.5, drinks_per_day=2.0),
     ]
     self.assertEqual(
         calculate.pearson(
             [i.get("sat") for i in students],
             [i.get("gpa") for i in students],
         ),
         0.9714441330841945
     )
     self.assertEqual(
         calculate.pearson(
             [i.get("sat") for i in students],
             [i.get("drinks_per_day") for i in students],
         ),
         -0.9435297685685435
     )
     self.assertRaises(ValueError, calculate.pearson, [1], [1, 2, 3])
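For reference, Pearson's r is the sum of the paired deviations from the two means divided by the product of the root sums of squared deviations. The pure-Python sketch below is written only for illustration and is independent of the calculate library (the pearson_sketch name is made up); it follows that definition and should reproduce the asserted values above to within floating-point noise.

import math


def pearson_sketch(list_one, list_two):
    """Illustrative Pearson's r; not the calculate library's implementation."""
    if len(list_one) != len(list_two):
        raise ValueError("The two lists must have the same number of entries")
    n = float(len(list_one))
    mean_one = sum(list_one) / n
    mean_two = sum(list_two) / n
    # Numerator: sum of the products of the paired deviations from the means
    numerator = sum(
        (x - mean_one) * (y - mean_two) for x, y in zip(list_one, list_two)
    )
    # Denominator: product of the root sums of squared deviations
    # (the 1/n factors cancel, so they are omitted on both sides)
    denominator = math.sqrt(
        sum((x - mean_one) ** 2 for x in list_one)
    ) * math.sqrt(
        sum((y - mean_two) ** 2 for y in list_two)
    )
    return numerator / denominator


# e.g. pearson_sketch([1200, 1400, 1100, 800], [3.6, 3.9, 3.0, 2.5])
# should land close to the 0.9714441330841945 asserted above.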
def pearsons_karma():
    stories = get_top_stories_with_user_karma()
    user_karma = [Decimal(s.get('user_karma')) for s in stories]
    story_karma = [Decimal(s.get('score')) for s in stories]
    return calculate.pearson(user_karma, story_karma)
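pearsons_karma leans on pieces that sit outside this snippet: it assumes `from decimal import Decimal`, `import calculate`, and a get_top_stories_with_user_karma() helper that returns dicts carrying 'user_karma' and 'score' keys. A hypothetical stub along those lines, with invented values purely for illustration, sketches the input shape the function expects:

def get_top_stories_with_user_karma():
    # Hypothetical stand-in for the real data source; the numbers are invented.
    return [
        {'user_karma': '1520', 'score': '312'},
        {'user_karma': '87', 'score': '45'},
        {'user_karma': '9406', 'score': '120'},
    ]

With a stub like that in place, the function simply correlates submitters' karma with the scores their stories earned.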
import math

import calculate


def benfords_law(number_list, method="first_digit", verbose=True):
    """
    Accepts a list of numbers and applies a quick-and-dirty run against Benford's Law.

    Benford's Law makes statements about the occurrence of leading digits in a dataset.
    It claims that a leading digit of 1 will occur about 30 percent of the time,
    with each higher digit appearing a little less often and the digit 9 occurring the least.
    
    Datasets that greatly vary from the law are sometimes suspected of fraud. 
    
    The function returns the Pearson correlation coefficient, also known as Pearson's r, 
    which reports how closely the two datasets are related.
    
    This function also includes a variation on the classic Benford analysis popularized 
    by blogger Nate Silver, who conducted an analysis of the final digits of polling
    data. To use Silver's variation, provide the keyword argument `method` with the
    value 'last_digit'.
    
    To prevent the function from printing, set the optional keyword argument `verbose` to False.
    
    This function is based upon code from a variety of sources around the web, but
    owes a particular debt to the work of Christian S. Perone.
    
    h3. Example usage
    
        >> import calculate
        >> calculate.benfords_law([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
        BENFORD'S LAW: FIRST_DIGIT

        Pearson's R: 0.86412304649

        | Number | Count | Expected Percentage | Actual Percentage |
        ------------------------------------------------------------
        | 1      | 2     | 30.1029995664       | 20.0              |
        | 2      | 1     | 17.6091259056       | 10.0              |
        | 3      | 1     | 12.4938736608       | 10.0              |
        | 4      | 1     | 9.69100130081       | 10.0              |
        | 5      | 1     | 7.91812460476       | 10.0              |
        | 6      | 1     | 6.69467896306       | 10.0              |
        | 7      | 1     | 5.79919469777       | 10.0              |
        | 8      | 1     | 5.11525224474       | 10.0              |
        | 9      | 1     | 4.57574905607       | 10.0              |
        
        >> calculate.benfords_law([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], verbose=False)
        -0.863801937698704
    
    h3. A Warning

    Not all datasets should be expected to conform to Benford's rules. 
    I lifted the following guidance from an academic paper linked 
    below.

    Durtschi, Hillison, and Pacini (2004) said Benford "compliance"
    should be expected in the following circumstances:

        1. Numbers that result from mathematical combination of numbers

        2. Transaction-level data (e.g., disbursements, sales) 

        3. Large datasets 

        4. Mean is greater than median and skew is positive 

    And not to expect Benford distributions when:

        1. Numbers are assigned (e.g., check numbers, invoice numbers) 
        
        2. Numbers influenced by human thought (e.g., $1.99)
        
        3. Accounts with a large number of firm-specific numbers 

        4. Accounts with a built-in minimum or maximum 

        5. Where no transaction is recorded.
    
    h3. Sources
    
        "Benford's Law":http://en.wikipedia.org/wiki/Benford%27s_law
        "Applying Benford's Law to CAR":http://www.chasedavis.com/2008/sep/28/applying-benfords-law-car/
        "Breaking the (Benford) Law: Statistical Fraud Detection in Campaign Finance (pdf)":http://cho.pol.uiuc.edu/wendy/papers/tas.pdf
        "Benford's Law meets Python and Apple Stock Prices":http://pyevolve.sourceforge.net/wordpress/?p=457
        "Strategic Vision Polls Exhibit Unusual Patterns, Possibly Indicating Fraud":http://www.fivethirtyeight.com/2009/09/strategic-vision-polls-exhibit-unusual.html
        "Nate Silver: pollster may be fraud":http://blogs.tampabay.com/buzz/2009/09/nate-silver-pollster-may-be-fraud.html
        
    
    """
    # Select the appropriate retrieval method
    if method not in ["last_digit", "first_digit"]:
        raise ValueError("The method you've requested is not included in this function.")

    def _get_first_digit(number):
        return int(str(number)[0])

    def _get_last_digit(number):
        return int(str(number)[-1])

    method_name = "_get_%s" % method
    method_obj = locals()[method_name]

    # Set the typical distributions we expect to find
    typical_distributions = {"first_digit": {}, "last_digit": {}}
    for number in xrange(1, 10):
        log10 = math.log10(1 + 1 / float(number)) * 100.0
        typical_distributions["first_digit"].update({number: log10})

    typical_distributions["last_digit"].update(
        {0: 10.0, 1: 10.0, 2: 10.0, 3: 10.0, 4: 10.0, 5: 10.0, 6: 10.0, 7: 10.0, 8: 10.0, 9: 10.0}
    )

    # Fetch the digits we want to analyze
    digit_list = []
    for number in number_list:
        digit = method_obj(number)
        digit_list.append(digit)

    # Loop through the data set and grab all the applicable numbers
    results = []
    for number in xrange(0, 10):
        count = digit_list.count(number)
        try:
            expected_percentage = typical_distributions[method][number]
        except KeyError:
            continue
        actual_percentage = count / float(len(digit_list)) * 100.0
        results.append([number, count, expected_percentage, actual_percentage])

    # Run the two percentage figures through
    # Pearson's correlation coefficient to
    # see how closely related they are.
    list_one = [i[2] for i in results]
    list_two = [i[3] for i in results]
    pearsons_r = calculate.pearson(list_one, list_two)

    # If the user has asked for verbosity,
    # print out this cutesy table with all
    # of the data.
    if verbose:
        from calculate import ptable

        # Convert results to strings
        results = [map(str, i) for i in results]
        # Print everything out using our pretty table module
        labels = ["Number", "Count", "Expected Percentage", "Actual Percentage"]
        print "BENFORD'S LAW: %s" % method.upper().replace("_", " ")
        print ""
        print "Pearson's r: %s" % (pearsons_r)
        print ""
        print ptable.indent([labels] + results, hasHeader=True, separateRows=False, prefix="| ", postfix=" |")

    return pearsons_r
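The expected percentages that benfords_law builds for the first-digit test, and that appear in the docstring table above, come straight from Benford's formula, log10(1 + 1/d) expressed as a percentage. A standalone sketch of that calculation shows where the 30.1, 17.6 and so on originate:

import math

# Benford's expected share of each leading digit, as a percentage.
# Digit 1 comes out near 30.1 percent and digit 9 near 4.6 percent.
for digit in range(1, 10):
    expected = math.log10(1 + 1.0 / digit) * 100.0
    print("%d: %s" % (digit, expected))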
def pearsons_karma():
    stories = get_top_stories_with_user_karma()
    user_karma = [Decimal(s.get('user_karma')) for s in stories]
    story_karma = [Decimal(s.get('score')) for s in stories]
    return calculate.pearson(user_karma, story_karma)
import math

import calculate


def benfords_law(number_list, method='first_digit', verbose=True):
    """
    Accepts a list of numbers and applies a quick-and-dirty run
    against Benford's Law.

    Benford's Law makes statements about the occurrence of leading digits in
    a dataset. It claims that a leading digit of 1 will occur about 30
    percent of the time, with each higher digit appearing a little less often
    and the digit 9 occurring the least.

    Datasets that greatly vary from the law are sometimes suspected of fraud.

    The function returns the Pearson correlation coefficient, also known as
    Pearson's r,  which reports how closely the two datasets are related.

    This function also includes a variation on the classic Benford analysis
    popularized by blogger Nate Silver, who conducted an analysis of the final
    digits of polling data. To use Silver's variation, provide the keyword
    argument `method` with the value 'last_digit'.

    To prevent the function from printing, set the optional keyword argument
    `verbose` to False.

    This function is based upon code from a variety of sources around the web,
    but owes a particular debt to the work of Christian S. Perone.

    h3. Example usage

        >> import calculate
        >> calculate.benfords_law([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
        BENFORD'S LAW: FIRST_DIGIT

        Pearson's R: 0.86412304649

        | Number | Count | Expected Percentage | Actual Percentage |
        ------------------------------------------------------------
        | 1      | 2     | 30.1029995664       | 20.0              |
        | 2      | 1     | 17.6091259056       | 10.0              |
        | 3      | 1     | 12.4938736608       | 10.0              |
        | 4      | 1     | 9.69100130081       | 10.0              |
        | 5      | 1     | 7.91812460476       | 10.0              |
        | 6      | 1     | 6.69467896306       | 10.0              |
        | 7      | 1     | 5.79919469777       | 10.0              |
        | 8      | 1     | 5.11525224474       | 10.0              |
        | 9      | 1     | 4.57574905607       | 10.0              |

        >> calculate.benfords_law([1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            verbose=False)
        -0.863801937698704

    h3. A Warning

    Not all datasets should be expected to conform to Benford's rules.
    I lifted the following guidance from an academic paper linked
    below.

    Durtschi, Hillison, and Pacini (2004) said Benford "compliance"
    should be expected in the following circumstances:

        1. Numbers that result from mathematical combination of numbers

        2. Transaction-level data (e.g., disbursements, sales)

        3. Large datasets

        4. Mean is greater than median and skew is positive

    And not to expect Benford distributions when:

        1. Numbers are assigned (e.g., check numbers, invoice numbers)

        2. Numbers influenced by human thought (e.g., $1.99)

        3. Accounts with a large number of firm-specific numbers

        4. Accounts with a built-in minimum or maximum

        5. Where no transaction is recorded.

    h3. Sources

        "Benford's Law":http://en.wikipedia.org/wiki/Benford%27s_law
        "Applying Benford's Law to CAR":http://www.chasedavis.com/2008/sep/\
28/applying-benfords-law-car/
        "Breaking the (Benford) Law: Statistical Fraud Detection in Campaign \
Finance (pdf)":http://cho.pol.uiuc.edu/wendy/papers/tas.pdf
        "Benford's Law meets Python and Apple Stock Prices":http://pyevolve.\
sourceforge.net/wordpress/?p=457
        "Strategic Vision Polls Exhibit Unusual Patterns, Possibly Indicating \
Fraud":http://www.fivethirtyeight.com/2009/09/strategic-vision-polls-\
exhibit-unusual.html
        "Nate Silver: pollster may be fraud":http://blogs.tampabay.com/buzz/\
2009/09/nate-silver-pollster-may-be-fraud.html
    """
    # Select the appropriate retrieval method
    if method not in ['last_digit', 'first_digit']:
        raise ValueError('The method you\'ve requested is not supported.')

    def _get_first_digit(number):
        return int(str(number)[0])

    def _get_last_digit(number):
        return int(str(number)[-1])

    # Dispatch to the digit extractor that matches the requested method
    method_obj = {
        'first_digit': _get_first_digit,
        'last_digit': _get_last_digit,
    }[method]

    # Set the typical distributions we expect to find
    typical_distributions = {
        'first_digit': {},
        'last_digit': {}
    }
    for number in range(1, 10):
        log10 = math.log10(1 + 1 / float(number)) * 100.0
        typical_distributions['first_digit'].update({number: log10})

    typical_distributions['last_digit'].update({
        0: 10.0, 1: 10.0, 2: 10.0, 3: 10.0, 4: 10.0,
        5: 10.0, 6: 10.0, 7: 10.0, 8: 10.0, 9: 10.0,
    })

    # Fetch the digits we want to analyze
    digit_list = []
    for number in number_list:
        digit = method_obj(number)
        digit_list.append(digit)

    # Loop through the data set and grab all the applicable numbers
    results = []
    for number in range(0, 10):
        count = digit_list.count(number)
        try:
            expected_percentage = typical_distributions[method][number]
        except KeyError:
            continue
        actual_percentage = count / float(len(digit_list)) * 100.0
        results.append([number, count, expected_percentage, actual_percentage])

    # Run the two percentage figures through
    # Pearson's correlation coefficient to
    # see how closely related they are.
    list_one = [i[2] for i in results]
    list_two = [i[3] for i in results]
    pearsons_r = calculate.pearson(list_one, list_two)

    # If the user has asked for verbosity,
    # print out this cutesy table with all
    # of the data.
    if verbose:
        from calculate import ptable
        # Convert results to strings
        results = [list(map(str, i)) for i in results]
        # Print everything out using our pretty table module
        labels = [
            'Number', 'Count', 'Expected Percentage', 'Actual Percentage'
        ]
        print("BENFORD'S LAW: %s" % method.upper().replace('_', ' '))
        print("")
        print("Pearson's r: %s" % (pearsons_r))
        print("")
        print(ptable.indent(
            [labels] + results,
            hasHeader=True,
            separateRows=False,
            prefix='| ', postfix=' |',
        ))

    return pearsons_r
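For the sample list used in the docstring, the actual-percentage column follows directly from counting leading digits: 10 contributes a second leading 1, so the digit 1 appears twice (20 percent) and every other digit once (10 percent). A quick standalone check of that counting step, using the same extraction `_get_first_digit` performs:

sample = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
# Leading digit of each number, the same extraction _get_first_digit performs
first_digits = [int(str(n)[0]) for n in sample]

for digit in range(1, 10):
    count = first_digits.count(digit)
    actual = count / float(len(first_digits)) * 100.0
    print("%d: count=%d, actual=%.1f%%" % (digit, count, actual))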