def voc_richness(sentences):
    """Return the ratio of distinct (stemmed) words to total words.

    Each sentence is stripped of punctuation and digits via ``f_utils``,
    then every remaining token is Porter-stemmed and lowercased before
    counting.  Returns 0.0 when no words remain, instead of raising
    ZeroDivisionError on empty input.
    """
    counter = Counter()
    stemmer = PorterStemmer()
    for s in sentences:
        s = f_utils.remove_punctuation(s)
        s = f_utils.remove_digits(s)
        counter += Counter(stemmer.stem(word).lower() for word in s)
    total = sum(counter.values())
    if total == 0:
        # Empty input (or everything filtered out): no meaningful ratio.
        return 0.0
    return len(counter) / total
def voc_dis_legomenon(sentences):
    """Return the ratio of words occurring exactly twice to total words.

    Each sentence is stripped of punctuation and digits via ``f_utils``,
    then every remaining token is Porter-stemmed and lowercased before
    counting.  Returns 0.0 when no words remain, instead of raising
    ZeroDivisionError on empty input.
    """
    counter = Counter()
    stemmer = PorterStemmer()
    for s in sentences:
        s = f_utils.remove_punctuation(s)
        s = f_utils.remove_digits(s)
        counter += Counter(stemmer.stem(word).lower() for word in s)
    # Count directly: len(filter(...)) raises TypeError on Python 3
    # because filter() returns a lazy iterator with no length.
    twice = sum(1 for freq in counter.values() if freq == 2)
    total = sum(counter.values())
    if total == 0:
        return 0.0
    return twice / total
def voc_hapax_legomenon(sentences):
    """Return the ratio of words occurring exactly once to total words.

    Each sentence is stripped of punctuation and digits via ``f_utils``,
    then every remaining token is Porter-stemmed and lowercased before
    counting.  Returns 0.0 when no words remain, instead of raising
    ZeroDivisionError on empty input.
    """
    counter = Counter()
    stemmer = PorterStemmer()
    for s in sentences:
        s = f_utils.remove_punctuation(s)
        s = f_utils.remove_digits(s)
        counter += Counter(stemmer.stem(word).lower() for word in s)
    # Count directly: len(filter(...)) raises TypeError on Python 3
    # because filter() returns a lazy iterator with no length.
    unique = sum(1 for freq in counter.values() if freq == 1)
    total = sum(counter.values())
    if total == 0:
        return 0.0
    return unique / total
def voc_yule(sentences):
    """Return a modified Yule's I measure of vocabulary richness.

    M1 is the vocabulary size (number of distinct stems) and M2 is the
    sum of squared stem frequencies; the result is
    ``(M1 * M2) / (M2 - M1)`` normalized by the total word count.
    Returns 0.0 when the formula is undefined (empty input, or M2 == M1).

    Each sentence is stripped of punctuation and digits via ``f_utils``,
    then every remaining token is Porter-stemmed and lowercased.
    """
    counter = Counter()
    stemmer = PorterStemmer()
    for s in sentences:
        s = f_utils.remove_punctuation(s)
        s = f_utils.remove_digits(s)
        counter += Counter(stemmer.stem(word).lower() for word in s)
    m1 = float(len(counter))
    # Sum of squared frequencies.  The original groupby() formulation
    # (sum of len(group) * freq**2 over consecutive runs) collapses to
    # exactly this, but groupby on unsorted values is an API misuse trap;
    # compute it directly instead.
    m2 = sum(freq ** 2 for freq in counter.values())
    try:
        return ((m1 * m2) / (m2 - m1)) / sum(counter.values())
    except ZeroDivisionError:
        # Empty input, or every stem has frequency 1 in a degenerate way
        # (m2 == m1): the measure is undefined.
        return 0.0
def voc_bottom10(sentences):
    """Return the ratio of the least frequent 10% of words to total words.

    The "bottom 10%" is 10% of the *distinct* stems (rounded, minimum 1),
    taken from the low end of ``Counter.most_common()``; their combined
    occurrence count is divided by the total word count.  Returns 0.0
    when no words remain, instead of raising ZeroDivisionError.

    Each sentence is stripped of punctuation and digits via ``f_utils``,
    then every remaining token is Porter-stemmed and lowercased before
    counting.
    """
    counter = Counter()
    stemmer = PorterStemmer()
    for s in sentences:
        s = f_utils.remove_punctuation(s)
        s = f_utils.remove_digits(s)
        counter += Counter(stemmer.stem(word).lower() for word in s)
    # At least one stem always counts as the "bottom 10%".
    bottom_n = max(int(round(len(counter) * 0.1)), 1)
    # Reverse slice of most_common() yields the bottom_n least frequent.
    least_common = counter.most_common()[:-bottom_n - 1:-1]
    sum_least_common = sum(freq for _, freq in least_common)
    total = sum(counter.values())
    if total == 0:
        return 0.0
    return sum_least_common / total