def voc_richness(sentences):
    """Return the type/token ratio: distinct stemmed words / total words.

    Each sentence is stripped of punctuation and digits, then every word is
    Porter-stemmed and lowercased before counting.

    Args:
        sentences: iterable of sentences (each accepted by
            ``f_utils.remove_punctuation`` / ``remove_digits``; presumably a
            word sequence after cleaning -- TODO confirm against f_utils).

    Returns:
        float in (0, 1]; 0.0 when no words are found (avoids 0/0).
    """
    counter = Counter()
    stemmer = PorterStemmer()

    for s in sentences:
        s = f_utils.remove_punctuation(s)
        s = f_utils.remove_digits(s)
        counter += Counter(map(lambda word: stemmer.stem(word).lower(), s))

    total = sum(counter.values())
    # Guard: empty input previously raised ZeroDivisionError (0 / 0).
    if total == 0:
        return 0.0
    return len(counter) / total
def voc_dis_legomenon(sentences):
    """Return ratio of words occurring exactly twice / total words.

    Words are cleaned of punctuation/digits, Porter-stemmed and lowercased
    before counting (same normalization as the other voc_* metrics).

    Args:
        sentences: iterable of sentences accepted by the f_utils cleaners.

    Returns:
        float in [0, 1]; 0.0 when no words are found (avoids 0/0).
    """
    counter = Counter()
    stemmer = PorterStemmer()
    for s in sentences:
        s = f_utils.remove_punctuation(s)
        s = f_utils.remove_digits(s)
        counter += Counter(map(lambda word: stemmer.stem(word).lower(), s))

    sum_counter = sum(counter.values())
    # Guard: empty input previously raised ZeroDivisionError (0 / 0).
    if sum_counter == 0:
        return 0.0
    # Count directly instead of len(filter(...)): on Python 3, filter()
    # returns an iterator and len() on it raises TypeError.
    twice = sum(1 for word in counter if counter[word] == 2)
    return twice / sum_counter
def voc_hapax_legomenon(sentences):
    """Return ratio of words occurring exactly once / total words.

    Words are cleaned of punctuation/digits, Porter-stemmed and lowercased
    before counting (same normalization as the other voc_* metrics).

    Args:
        sentences: iterable of sentences accepted by the f_utils cleaners.

    Returns:
        float in [0, 1]; 0.0 when no words are found (avoids 0/0).
    """
    counter = Counter()
    stemmer = PorterStemmer()
    for s in sentences:
        s = f_utils.remove_punctuation(s)
        s = f_utils.remove_digits(s)
        counter += Counter(map(lambda word: stemmer.stem(word).lower(), s))

    sum_counter = sum(counter.values())
    # Guard: empty input previously raised ZeroDivisionError (0 / 0).
    if sum_counter == 0:
        return 0.0
    # Count directly instead of len(filter(...)): on Python 3, filter()
    # returns an iterator and len() on it raises TypeError.
    unique = sum(1 for word in counter if counter[word] == 1)
    return unique / sum_counter
def voc_yule(sentences):
    """Return a modified Yule's I measure of lexical diversity.

    With M1 = number of distinct (stemmed, lowercased) words and
    M2 = sum of freq**2 over all distinct words, computes

        ((M1 * M2) / (M2 - M1)) / total_words

    NOTE(review): classic Yule's I uses token/frequency-class totals; this
    variant uses the vocabulary size for M1 and normalizes by the token
    count -- kept as-is ("modified" per the original docstring).

    Args:
        sentences: iterable of sentences accepted by the f_utils cleaners.

    Returns:
        float; 0.0 when M2 == M1 (ZeroDivisionError guard, which also
        covers empty input where both are 0).
    """
    stemmer = PorterStemmer()
    counter = Counter()
    for s in sentences:
        s = f_utils.remove_punctuation(s)
        s = f_utils.remove_digits(s)
        counter += Counter(map(lambda word: stemmer.stem(word).lower(), s))

    M1 = float(len(counter))
    # Direct sum of freq**2 per distinct word. The previous
    # groupby()-over-unsorted-values version reduced to the same total but
    # only by accident of each element contributing its own freq**2.
    M2 = sum(freq ** 2 for freq in counter.values())

    try:
        return ((M1 * M2) / (M2 - M1)) / sum(counter.values())
    except ZeroDivisionError:
        return 0.0
def voc_bottom10(sentences):
    """Return ratio of least frequent words (bottom 10%) / total words.

    "Bottom 10%" means the 10% (at least one) of *distinct* stemmed words
    with the lowest frequencies; the ratio is the sum of their occurrence
    counts over the total word count.

    Args:
        sentences: iterable of sentences accepted by the f_utils cleaners.

    Returns:
        float in [0, 1]; 0.0 when no words are found (avoids 0/0).
    """
    counter = Counter()
    stemmer = PorterStemmer()
    for s in sentences:
        s = f_utils.remove_punctuation(s)
        s = f_utils.remove_digits(s)
        counter += Counter(map(lambda word: stemmer.stem(word).lower(), s))

    sum_counter = sum(counter.values())
    # Guard: empty input previously raised ZeroDivisionError (0 / 0).
    if sum_counter == 0:
        return 0.0

    # Least common 10% of distinct words (minimum 1). most_common() sorts
    # descending, so a reversed tail slice yields the n least common.
    bottom_10 = max(int(round(len(counter) * 0.1)), 1)
    least_common_10 = counter.most_common()[:-bottom_10 - 1:-1]
    sum_least_common = sum(freq for _, freq in least_common_10)

    return sum_least_common / sum_counter