Ejemplo n.º 1
0
def ChiSquared(target_frequency):
    """Build a scorer that rates text against a reference frequency distribution.

    The scorer that is returned runs ``frequency_analyze`` on the text it
    receives, hands that result together with ``target_frequency`` to
    ``chi_squared`` and returns the negated value.

    Note:
        It is easy to be penalised without noticing when using this scorer.
        The English frequency ngrams are capital letters, so any text scored
        against them must be all capitals for the result to be correct.
        This is a known issue that still needs a fix.

    Todo:
        Maybe include a parameter for ngram size. There has not been a use
        case for this yet; it can be added once there is evidence it is needed.

    Example:
        >>> fitness = ChiSquared(english.unigrams)
        >>> fitness("ABC")
        -32.2

    Args:
        target_frequency (dict): symbol to frequency mapping of the distribution to compare with

    Returns:
        A function that takes ``text`` and returns the negated chi-squared
        value of its frequency distribution against ``target_frequency``.
    """
    def score(text):
        observed = frequency_analyze(text)
        return -chi_squared(observed, target_frequency)

    return score
Ejemplo n.º 2
0
def test_frequency_analyze():
    """Single symbols are counted when no ngram size is passed."""
    observed = frequency.frequency_analyze("abb")
    expected = {'a': 1, 'b': 2}
    assert observed == expected
Ejemplo n.º 3
0
def test_frequency_analyze_empty_string():
    """Analysing an empty string yields an empty mapping."""
    observed = frequency.frequency_analyze("")
    expected = {}
    assert observed == expected
Ejemplo n.º 4
0
def test_frequency_analyze_bigram():
    """Pairs of adjacent symbols are counted when the ngram size is 2."""
    observed = frequency.frequency_analyze("abb", 2)
    expected = {'ab': 1, 'bb': 1}
    assert observed == expected
Ejemplo n.º 5
0
 def inner(text):
     """Return the negated chi-squared value of ``text`` against ``target_frequency``.

     ``text`` is first joined into a single string, so an iterable of string
     pieces is accepted as well as a plain string.
     """
     joined = ''.join(text)
     observed = frequency_analyze(joined)
     return -chi_squared(observed, target_frequency)
Ejemplo n.º 6
0
import sys
import argparse
from lantern.analysis import frequency

# Command-line helper: count the ngrams of a ciphertext and print some of them.
parser = argparse.ArgumentParser()
parser.add_argument("ctext", help="text to run the frequency analysis on")
parser.add_argument("-n", type=int,
                    help="ngram size passed to frequency_analyze")
parser.add_argument("-m", type=int,
                    help="number of entries to print (default: 10)")
args = parser.parse_args()

# argparse leaves args.n as None when -n is omitted. Forwarding that None would
# be passed as an explicit ngram size, so only supply the size when the user
# gave one and otherwise let frequency_analyze use its own default.
if args.n is None:
    counts = frequency.frequency_analyze(args.ctext)
else:
    counts = frequency.frequency_analyze(args.ctext, args.n)

# Sorted ascending by count, so the slice below shows the lowest counts first.
counts = sorted(counts.items(), key=lambda item: item[1])

# A missing -m (None) and an explicit 0 both fall back to 10 entries.
m = args.m or 10
print(counts[:m])