def mean_sent_len_char(sentences):
    """Return mean sentence length in characters.

    A sentence's length is the summed length of the words left after
    ``f_utils.remove_punctuation``; nothing is added for gaps between words.

    Args:
        sentences: Iterable of sentences in the form accepted by
            ``f_utils.remove_punctuation``.

    Returns:
        Mean number of characters per sentence, or 0 when ``sentences`` is
        empty.
    """
    # Materialise the lengths: they are both summed and counted, which a lazy
    # ``map`` object (Python 3) does not support.
    sent_lengths = [
        sum(len(word) for word in f_utils.remove_punctuation(sentence))
        for sentence in sentences
    ]
    if not sent_lengths:
        return 0
    # NOTE(review): a fractional result on Python 2 requires
    # ``from __future__ import division`` at module level -- confirm.
    return sum(sent_lengths) / len(sent_lengths)
def digit_words_19(sentences):
    """Yield ratios for the numbers 0-19 written as words or as digits.

    Args:
        sentences: Iterable of sentences in the form accepted by
            ``f_utils.remove_punctuation``.

    Yields:
        ``(feature_name, value)`` pairs.  When neither kind of number occurs
        only ``("digit_words_19", 0)`` is yielded.  Otherwise:

        * ``"digit_words_19"``: words found in ``_DIGITWORDS_19`` / total words
        * ``"digits_19"``: digit tokens below 20 / total words
        * ``"digit_words_19_<word>"``: count of that word / all matches
        * ``"digits_19_<token>"``: count of that digit token / all matches
    """
    counter_digwords = Counter()
    counter_digits = Counter()
    total_words = 0
    for sentence in sentences:
        words = f_utils.remove_punctuation(sentence)
        total_words += len(words)
        # update() counts in place; ``counter += Counter(...)`` additionally
        # walks everything counted so far on every sentence.
        counter_digwords.update(
            word for word in words if word in _DIGITWORDS_19)
        # isdigit() rules out signs and decimals, so only the upper bound
        # needs checking.
        counter_digits.update(
            word for word in words if word.isdigit() and int(word) < 20)

    sum_digwords = sum(counter_digwords.values())
    sum_digits = sum(counter_digits.values())
    total = sum_digwords + sum_digits
    if total == 0:
        yield "digit_words_19", 0
        return

    # total > 0 implies total_words > 0, so the divisions below are safe.
    yield "digit_words_19", sum_digwords / total_words
    yield "digits_19", sum_digits / total_words
    for key in counter_digwords:
        yield "digit_words_19_" + key, counter_digwords[key] / total
    for key in counter_digits:
        yield "digits_19_" + key, counter_digits[key] / total
def medium_words(sentences):
    """Return medium long word ratio.

    A word is "medium" when it has 3 to 7 characters (inclusive).

    Args:
        sentences: Iterable of sentences in the form accepted by
            ``f_utils.remove_punctuation``.

    Returns:
        Medium words / total words, or 0 when there are no words at all.
    """
    # NOTE(review): words of exactly 3 characters are also counted by
    # short_words (``len(word) <= 3``) -- confirm the overlap is intended.
    medium_count = 0
    total_words = 0
    for sentence in sentences:
        words = f_utils.remove_punctuation(sentence)
        medium_count += sum(1 for word in words if 3 <= len(word) <= 7)
        total_words += len(words)
    if not total_words:
        # No words: the ratio is undefined, report 0 instead of raising.
        return 0
    return medium_count / total_words
def long_words(sentences):
    """Return long word ratio.

    A word is "long" when it has more than 7 characters.

    Args:
        sentences: Iterable of sentences in the form accepted by
            ``f_utils.remove_punctuation``.

    Returns:
        Long words / total words, or 0 when there are no words at all.
    """
    long_count = 0
    total_words = 0
    for sentence in sentences:
        words = f_utils.remove_punctuation(sentence)
        long_count += sum(1 for word in words if len(word) > 7)
        total_words += len(words)
    if not total_words:
        # No words: the ratio is undefined, report 0 instead of raising.
        return 0
    return long_count / total_words
def short_words(sentences):
    """Return short word ratio.

    A word is "short" when it has at most 3 characters.

    Args:
        sentences: Iterable of sentences in the form accepted by
            ``f_utils.remove_punctuation``.

    Returns:
        Short words / total words, or 0 when there are no words at all.
    """
    short_count = 0
    total_words = 0
    for sentence in sentences:
        words = f_utils.remove_punctuation(sentence)
        short_count += sum(1 for word in words if len(word) <= 3)
        total_words += len(words)
    if not total_words:
        # No words: the ratio is undefined, report 0 instead of raising.
        return 0
    return short_count / total_words
def apostrophes(sentences):
    """Return ratio of words which contain apostrophes / total words.

    Words are inspected after ``f_utils.remove_punctuation``; a word counts
    when it contains the ASCII apostrophe ``'``.

    Args:
        sentences: Iterable of sentences in the form accepted by
            ``f_utils.remove_punctuation``.

    Returns:
        Words containing an apostrophe / total words, or 0 when there are no
        words at all.
    """
    apostrophe_count = 0
    total_words = 0
    for sentence in sentences:
        words = f_utils.remove_punctuation(sentence)
        apostrophe_count += sum(1 for word in words if "'" in word)
        total_words += len(words)
    if not total_words:
        # No words: the ratio is undefined, report 0 instead of raising.
        return 0
    return apostrophe_count / total_words
def voc_richness(sentences):
    """Return ratio different words / total words.

    Words are passed through ``f_utils.remove_punctuation`` and
    ``f_utils.remove_digits``, then Porter-stemmed and lower-cased before
    counting, so the ratio is over stems rather than surface forms.

    Args:
        sentences: Iterable of sentences in the form accepted by
            ``f_utils.remove_punctuation``.

    Returns:
        Number of distinct stems / total number of stems, or 0 when no words
        remain.
    """
    counter = Counter()
    stemmer = PorterStemmer()
    for sentence in sentences:
        words = f_utils.remove_punctuation(sentence)
        words = f_utils.remove_digits(words)
        # update() counts in place instead of re-walking the whole counter
        # for every sentence as ``counter += Counter(...)`` does.
        counter.update(stemmer.stem(word).lower() for word in words)
    total = sum(counter.values())
    if not total:
        # No words: the ratio is undefined, report 0 instead of raising.
        return 0
    return len(counter) / total
def mean_word_len(sentences):
    """Return mean word length.

    Word lengths are measured after ``f_utils.remove_punctuation`` has been
    applied to each sentence.

    Args:
        sentences: Iterable of sentences in the form accepted by
            ``f_utils.remove_punctuation``.

    Returns:
        Total characters / number of words, or 0 when there are no words.
    """
    char_total = 0
    word_total = 0
    for sentence in sentences:
        for word in f_utils.remove_punctuation(sentence):
            char_total += len(word)
            word_total += 1
    if not word_total:
        return 0
    return char_total / word_total
def char_freq(sentences):
    """Yield relative character frequencies.

    Characters are lower-cased and taken from the words returned by
    ``f_utils.remove_punctuation``.

    Args:
        sentences: Iterable of sentences in the form accepted by
            ``f_utils.remove_punctuation``.

    Yields:
        ``("char_freq_<char>", count / total characters)`` for every
        character seen; nothing when no characters were counted.
    """
    counter = Counter()
    for sentence in sentences:
        for word in f_utils.remove_punctuation(sentence):
            # In-place update; ``counter += Counter(...)`` per word re-walks
            # the accumulated counter every time.
            counter.update(word.lower())
    sum_char = sum(counter.values())
    # Iterating the counter directly works on Python 2 and 3
    # (``iterkeys`` exists only on Python 2).
    for char in counter:
        yield "char_freq_" + char, counter[char] / sum_char
def vowels(sentences):
    """Return ratio vowels / total alpha-chars.

    Only characters for which ``str.isalpha`` is true are considered; a
    vowel is one whose lower-case form is one of ``a e i o u``.

    Args:
        sentences: Iterable of sentences in the form accepted by
            ``f_utils.remove_punctuation``.

    Returns:
        Vowels / alphabetic characters, or 0 when there are no alphabetic
        characters.
    """
    alpha_count = 0
    vowel_count = 0
    # Count directly instead of calling len() on a filter() result, which is
    # a lazy object without a length on Python 3.
    for sentence in sentences:
        for word in f_utils.remove_punctuation(sentence):
            for char in word:
                if not char.isalpha():
                    continue
                alpha_count += 1
                if char.lower() in "aeiou":
                    vowel_count += 1
    if not alpha_count:
        return 0
    return vowel_count / alpha_count
def char_bigrams(sentences):
    """Yield relative character bigram frequencies.

    Bigrams are built per word (never across word boundaries) from the
    lower-cased words returned by ``f_utils.remove_punctuation``.

    Args:
        sentences: Iterable of sentences in the form accepted by
            ``f_utils.remove_punctuation``.

    Yields:
        ``("char_bigram_<a>_<b>", count / total bigrams)`` for every bigram
        seen; nothing when no bigram was found.
    """
    counter = Counter()
    for sentence in sentences:
        for word in f_utils.remove_punctuation(sentence):
            # In-place update avoids re-walking the accumulated counter for
            # every word, as ``counter += Counter(...)`` does.
            counter.update(
                a + '_' + b for a, b in nltk.ngrams(word.lower(), 2))
    sum_counter = sum(counter.values())
    # Plain iteration is portable; ``iterkeys`` is Python-2-only.
    for bigram in counter:
        yield "char_bigram_" + bigram, counter[bigram] / sum_counter
def word_len_freq(sentences):
    """Yield relative frequency of word lengths up to 20 characters.

    Words longer than 20 characters are ignored, both in the counts and in
    the denominator.

    Args:
        sentences: Iterable of sentences in the form accepted by
            ``f_utils.remove_punctuation``.

    Yields:
        ``("word_len_freq_<length>", count / counted words)`` for every
        length seen; nothing when no word was counted.
    """
    counter = Counter()
    for sentence in sentences:
        words = f_utils.remove_punctuation(sentence)
        # Lengths are stored as strings because they become part of the
        # feature name below.
        counter.update(
            str(len(word)) for word in words if len(word) <= 20)
    sum_words = sum(counter.values())
    # Plain iteration is portable; ``iterkeys`` is Python-2-only.
    for length in counter:
        yield "word_len_freq_" + length, counter[length] / sum_words
def sent_len_4grams(sentences):
    """Yield sentence length 4-gram frequencies.

    Every sentence is reduced to the tag ``f_utils.tag_sent_len`` returns
    for its word count (after punctuation removal); 4-grams are then taken
    over the sequence of tags of consecutive sentences.

    Args:
        sentences: Sequence of sentences in the form accepted by
            ``f_utils.remove_punctuation``.

    Yields:
        ``("sent_len_4gram_<tags>", count / total 4-grams)`` where ``<tags>``
        is the four tags concatenated without a separator; nothing when
        there are fewer than four sentences.
    """
    # presumably tag_sent_len returns a string tag (tags are joined with
    # ``+`` below) -- verify against f_utils.
    length_tags = [
        f_utils.tag_sent_len(len(f_utils.remove_punctuation(sentence)))
        for sentence in sentences
    ]
    counter = Counter(
        a + b + c + d for a, b, c, d in nltk.util.ngrams(length_tags, 4))
    sum_counter = sum(counter.values())
    # Plain iteration is portable; ``iterkeys`` is Python-2-only.
    for key in counter:
        yield "sent_len_4gram_" + key, counter[key] / sum_counter
def sent_len_bigrams(sentences):
    """Yield sentence length bigram frequencies.

    Every sentence is reduced to the tag ``f_utils.tag_sent_len`` returns
    for its word count (after punctuation removal); bigrams are then taken
    over the sequence of tags of consecutive sentences.

    Args:
        sentences: Sequence of sentences in the form accepted by
            ``f_utils.remove_punctuation``.

    Yields:
        ``("sent_len_bigram_<tags>", count / total bigrams)`` where
        ``<tags>`` is the two tags concatenated without a separator; nothing
        when there are fewer than two sentences.
    """
    # presumably tag_sent_len returns a string tag (tags are joined with
    # ``+`` below) -- verify against f_utils.
    length_tags = [
        f_utils.tag_sent_len(len(f_utils.remove_punctuation(sentence)))
        for sentence in sentences
    ]
    counter = Counter(
        a + b for a, b in nltk.util.ngrams(length_tags, 2))
    sum_counter = sum(counter.values())
    # Plain iteration is portable; ``iterkeys`` is Python-2-only.
    for key in counter:
        yield "sent_len_bigram_" + key, counter[key] / sum_counter
def word_len_bigrams(sentences):
    """Yield word length bigram frequencies.

    For each sentence the lengths of consecutive words (after punctuation
    removal) are paired; pairs never span two sentences.

    Args:
        sentences: Iterable of sentences in the form accepted by
            ``f_utils.remove_punctuation``.

    Yields:
        ``("word_len_bigram_<len1>_<len2>", count / total bigrams)`` for
        every pair seen; nothing when no pair was found.
    """
    counter = Counter()
    for sentence in sentences:
        word_lengths = [
            len(word) for word in f_utils.remove_punctuation(sentence)
        ]
        # In-place update instead of ``counter += Counter(...)``, which
        # re-walks the accumulated counter for every sentence.
        counter.update(
            str(a) + '_' + str(b)
            for a, b in nltk.util.ngrams(word_lengths, 2))
    sum_bigrams = sum(counter.values())
    # Plain iteration is portable; ``iterkeys`` is Python-2-only.
    for key in counter:
        yield "word_len_bigram_" + key, counter[key] / sum_bigrams
def voc_hapax_legomenon(sentences):
    """Return ratio unique words / total words.

    "Unique" means occurring exactly once.  Words are passed through
    ``f_utils.remove_punctuation`` and ``f_utils.remove_digits``, then
    Porter-stemmed and lower-cased before counting.

    Args:
        sentences: Iterable of sentences in the form accepted by
            ``f_utils.remove_punctuation``.

    Returns:
        Stems occurring exactly once / total number of stems, or 0 when no
        words remain.
    """
    counter = Counter()
    stemmer = PorterStemmer()
    for sentence in sentences:
        words = f_utils.remove_punctuation(sentence)
        words = f_utils.remove_digits(words)
        counter.update(stemmer.stem(word).lower() for word in words)
    sum_counter = sum(counter.values())
    if not sum_counter:
        # No words: the ratio is undefined, report 0 instead of raising.
        return 0
    # Count matches directly; len() of a filter() result fails on Python 3.
    once = sum(1 for count in counter.values() if count == 1)
    return once / sum_counter
def voc_dis_legomenon(sentences):
    """Return ratio words occurring twice / total words.

    Words are passed through ``f_utils.remove_punctuation`` and
    ``f_utils.remove_digits``, then Porter-stemmed and lower-cased before
    counting.

    Args:
        sentences: Iterable of sentences in the form accepted by
            ``f_utils.remove_punctuation``.

    Returns:
        Stems occurring exactly twice / total number of stems, or 0 when no
        words remain.
    """
    counter = Counter()
    stemmer = PorterStemmer()
    for sentence in sentences:
        words = f_utils.remove_punctuation(sentence)
        words = f_utils.remove_digits(words)
        counter.update(stemmer.stem(word).lower() for word in words)
    sum_counter = sum(counter.values())
    if not sum_counter:
        # No words: the ratio is undefined, report 0 instead of raising.
        return 0
    # Count matches directly; len() of a filter() result fails on Python 3.
    twice = sum(1 for count in counter.values() if count == 2)
    return twice / sum_counter
def char_4grams(sentences):
    """Yield relative character 4-gram frequencies.

    4-grams are built per word (never across word boundaries) from the
    lower-cased words returned by ``f_utils.remove_punctuation``.

    Args:
        sentences: Iterable of sentences in the form accepted by
            ``f_utils.remove_punctuation``.

    Yields:
        ``("char_4gram_<a>_<b>_<c>_<d>", count / total 4-grams)`` for every
        4-gram seen; nothing when no 4-gram was found.
    """
    counter = Counter()
    for sentence in sentences:
        for word in f_utils.remove_punctuation(sentence):
            # In-place update avoids re-walking the accumulated counter for
            # every word, as ``counter += Counter(...)`` does.
            counter.update(
                '_'.join(gram) for gram in nltk.ngrams(word.lower(), 4))
    sum_counter = sum(counter.values())
    # Plain iteration is portable; ``iterkeys`` is Python-2-only.
    for fourgram in counter:
        yield "char_4gram_" + fourgram, counter[fourgram] / sum_counter
def ordinals(sentences):
    """Return ratio of ordinals / total words.

    A word is an ordinal when ``f_utils.is_ordinal`` accepts it (per the
    original description: digits followed by 'th', 'st', 'nd' or 'rd').

    Args:
        sentences: Iterable of sentences in the form accepted by
            ``f_utils.remove_punctuation``.

    Returns:
        Ordinals / total words, or 0 when no ordinal was found.
    """
    ordinal_count = 0
    word_count = 0
    for sentence in sentences:
        words = f_utils.remove_punctuation(sentence)
        word_count += len(words)
        ordinal_count += sum(1 for word in words if f_utils.is_ordinal(word))
    if ordinal_count:
        return ordinal_count / word_count
    return 0
def char_trigrams(sentences):
    """Yield relative character trigram frequencies.

    Only purely alphabetic words longer than three characters contribute;
    trigrams are built per lower-cased word.

    Args:
        sentences: Iterable of sentences in the form accepted by
            ``f_utils.remove_punctuation``.

    Yields:
        ``("char_trigram_<a>_<b>_<c>", count / total trigrams)`` for every
        trigram seen; nothing when no trigram was found.
    """
    counter = Counter()
    for sentence in sentences:
        for word in f_utils.remove_punctuation(sentence):
            # NOTE(review): three-letter words are skipped although they
            # contain one trigram -- confirm this is intended.
            if len(word) <= 3 or not word.isalpha():
                continue
            # In-place update avoids re-walking the accumulated counter for
            # every word, as ``counter += Counter(...)`` does.
            counter.update(
                a + '_' + b + '_' + c
                for a, b, c in nltk.ngrams(word.lower(), 3))
    sum_counter = sum(counter.values())
    # Plain iteration is portable; ``iterkeys`` is Python-2-only.
    for trigram in counter:
        yield "char_trigram_" + trigram, counter[trigram] / sum_counter
def percent(sentences):
    """Yield ratio of the word 'percent' and of '%' / total words.

    Matches are looked up in the raw sentence, while the denominator is the
    word count after ``f_utils.remove_punctuation``.

    Args:
        sentences: Iterable of sentences, each an iterable of word strings.

    Yields:
        ``(token, count / total words)`` for each matched token exactly as
        written in the text (``"%"``, ``"percent"``, ``"Percent"``, ...).
        When nothing matched, or no words remain to divide by, a single
        ``("percent", 0)`` is yielded.
    """
    counter = Counter()
    total_words = 0
    for sentence in sentences:
        # Match before stripping punctuation -- presumably because
        # remove_punctuation would drop the "%" token; confirm.
        counter.update(
            word for word in sentence
            if word.lower() == "percent" or word == "%")
        total_words += len(f_utils.remove_punctuation(sentence))

    # total_words can be 0 even with matches (e.g. a text consisting only of
    # "%"); fall back to the neutral value rather than dividing by zero.
    if not counter or not total_words:
        yield "percent", 0
        return
    # Plain iteration is portable; ``iterkeys`` is Python-2-only.
    for key in counter:
        yield key, counter[key] / total_words
def fraction_words(sentences):
    """Return ratio fraction words / total words.

    A word is a fraction word when ``f_utils.endswith_fractionword`` accepts
    it; words are checked after ``f_utils.remove_punctuation``.

    Args:
        sentences: Iterable of sentences in the form accepted by
            ``f_utils.remove_punctuation``.

    Returns:
        Fraction words / total words, or 0 when no fraction word was found.
    """
    fraction_count = 0
    word_count = 0
    for sentence in sentences:
        words = f_utils.remove_punctuation(sentence)
        word_count += len(words)
        fraction_count += sum(
            1 for word in words if f_utils.endswith_fractionword(word))
    if fraction_count:
        return fraction_count / word_count
    return 0
def multiplicat_num(sentences):
    """Yield ratio of multiplicative numbers (once, twice, thrice) / total words.

    A word matches when its lower-case form is in ``_MULTIPLICAT_NUM``.

    Args:
        sentences: Iterable of sentences in the form accepted by
            ``f_utils.remove_punctuation``.

    Yields:
        ``("times_<word>", count / total words)`` for each matched word, or
        a single ``("times_once", 0)`` when nothing matched.
    """
    counter = Counter()
    total_words = 0
    for sentence in sentences:
        words = f_utils.remove_punctuation(sentence)
        total_words += len(words)
        # NOTE(review): matching is case-insensitive but the original
        # spelling is counted, so "Once" and "once" become separate
        # features -- confirm this is intended.
        counter.update(
            word for word in words if word.lower() in _MULTIPLICAT_NUM)

    if not counter:
        yield "times_" + "once", 0
        return
    # A match implies total_words > 0, so the division is safe.  Plain
    # iteration is portable; ``iterkeys`` is Python-2-only.
    for key in counter:
        yield "times_" + key, counter[key] / total_words
def word_len_4grams(sentences):
    """Yield word length 4-gram frequencies.

    For each sentence the lengths of four consecutive words (after
    punctuation removal) form a 4-gram; 4-grams never span two sentences.

    Args:
        sentences: Iterable of sentences in the form accepted by
            ``f_utils.remove_punctuation``.

    Yields:
        ``("word_len_4gram_<l1>_<l2>_<l3>_<l4>", count / total 4-grams)``
        for every 4-gram seen; nothing when no 4-gram was found.
    """
    counter = Counter()
    for sentence in sentences:
        word_lengths = [
            len(word) for word in f_utils.remove_punctuation(sentence)
        ]
        # In-place update instead of ``counter += Counter(...)``, which
        # re-walks the accumulated counter for every sentence.
        counter.update(
            '_'.join(str(length) for length in gram)
            for gram in nltk.util.ngrams(word_lengths, 4))
    sum_4grams = sum(counter.values())
    # Plain iteration is portable; ``iterkeys`` is Python-2-only.
    for key in counter:
        yield "word_len_4gram_" + key, counter[key] / sum_4grams
def freq_consonants(sentences):
    """Yield ratio of consonant groups / total alpha-chars.

    The consonants are split into four fixed groups (``tnsrh``, ``ldcpf``,
    ``mwybg``, ``jkqvxz``); matching is case-insensitive and only characters
    for which ``str.isalpha`` is true are counted.

    Args:
        sentences: Iterable of sentences in the form accepted by
            ``f_utils.remove_punctuation``.

    Yields:
        ``("freq_cons_<group>", group count / alphabetic characters)`` for
        each of the four groups, in the order listed above.  All four values
        are 0 when the text contains no alphabetic characters.
    """
    alphas = []
    for sentence in sentences:
        for word in f_utils.remove_punctuation(sentence):
            alphas.extend(char.lower() for char in word if char.isalpha())

    total = len(alphas)
    for group in ("tnsrh", "ldcpf", "mwybg", "jkqvxz"):
        # Count matches directly: len() of a filter() result fails on
        # Python 3.
        hits = sum(1 for char in alphas if char in group)
        # Without any letters the ratio is undefined; report 0 rather than
        # raising ZeroDivisionError.
        yield "freq_cons_" + group, (hits / total if total else 0)
def auxiliary_verbs(sentences):
    """Yield auxiliary verb ratios.

    ``_VERBS_AUX`` maps a verb key to the collection of word forms counted
    for it; matching is done on the words exactly as returned by
    ``f_utils.remove_punctuation``.

    Args:
        sentences: Iterable of sentences in the form accepted by
            ``f_utils.remove_punctuation``.

    Yields:
        ``("auxverb_<key>", count / all auxiliary verbs)`` for every key
        that occurred at least once, followed by
        ``("auxiliary_verbs", all auxiliary verbs / total words)``.  The
        latter is 0 when there are no words at all.
    """
    total_words = 0
    counter = Counter()
    for sentence in sentences:
        words = f_utils.remove_punctuation(sentence)
        total_words += len(words)
        for key, forms in _VERBS_AUX.items():
            hits = sum(1 for word in words if word in forms)
            # Only record keys that actually occurred, mirroring the way
            # Counter addition discards zero counts.
            if hits:
                counter[key] += hits

    sum_counter = sum(counter.values())
    # Plain iteration is portable; ``iterkeys`` is Python-2-only.
    for key in counter:
        yield "auxverb_" + key, counter[key] / sum_counter
    # Guard the empty-text case instead of raising ZeroDivisionError.
    yield "auxiliary_verbs", (sum_counter / total_words if total_words else 0)
def num_times(sentences):
    """Yield ratio of one/two/three time(s) / total words.

    Lower-cased word bigrams of each sentence are matched against
    ``_NUMTIMES_BIGRAMS``; bigrams never span two sentences.

    Args:
        sentences: Iterable of sentences in the form accepted by
            ``f_utils.remove_punctuation``.

    Yields:
        ``("times_<first>_<second>", count / total words)`` for each matched
        bigram, or a single ``("times_one_time", 0)`` when nothing matched.
    """
    counter = Counter()
    total_words = 0
    for sentence in sentences:
        words = f_utils.remove_punctuation(sentence)
        total_words += len(words)
        lowered = [word.lower() for word in words]
        counter.update(
            bigram for bigram in nltk.ngrams(lowered, 2)
            if bigram in _NUMTIMES_BIGRAMS)

    if not counter:
        yield "times_" + "one_time", 0
        return
    # A match implies total_words > 0, so the division is safe.  Plain
    # iteration is portable; ``iterkeys`` is Python-2-only.
    for first, second in counter:
        yield ("times_" + first + "_" + second,
               counter[(first, second)] / total_words)
def voc_yule(sentences):
    """Return modified Yule's I measure.

    Words are passed through ``f_utils.remove_punctuation`` and
    ``f_utils.remove_digits``, then Porter-stemmed and lower-cased.  With
    ``M1`` the number of distinct stems and ``M2`` the sum of squared stem
    frequencies, the result is ``(M1 * M2) / (M2 - M1)`` divided by the
    total number of stems.

    Args:
        sentences: Iterable of sentences in the form accepted by
            ``f_utils.remove_punctuation``.

    Returns:
        The measure as a float, or 0.0 when a denominator is zero (for
        example when every stem occurs exactly once, or there are no words).
    """
    stemmer = PorterStemmer()
    stem_counts = Counter()
    for sentence in sentences:
        words = f_utils.remove_digits(f_utils.remove_punctuation(sentence))
        stem_counts.update(stemmer.stem(word).lower() for word in words)

    frequencies = list(stem_counts.values())
    distinct = float(len(stem_counts))
    # Sum of squared frequencies; grouping equal frequencies first yields
    # the same total, so the squares are simply summed one by one.
    squares = sum(freq ** 2 for freq in frequencies)
    try:
        return ((distinct * squares) / (squares - distinct)) / sum(frequencies)
    except ZeroDivisionError:
        return 0.0
def voc_bottom10(sentences):
    """Return ratio least frequent words (bottom 10%) / total words.

    Words are passed through ``f_utils.remove_punctuation`` and
    ``f_utils.remove_digits``, then Porter-stemmed and lower-cased.  The
    "bottom 10%" is the rarest tenth of the distinct stems (rounded, but at
    least one stem); their occurrences are summed and divided by the total
    number of stems.

    Args:
        sentences: Iterable of sentences in the form accepted by
            ``f_utils.remove_punctuation``.

    Returns:
        Occurrences of the least frequent stems / total number of stems, or
        0 when no words remain.
    """
    counter = Counter()
    stemmer = PorterStemmer()
    for sentence in sentences:
        words = f_utils.remove_punctuation(sentence)
        words = f_utils.remove_digits(words)
        counter.update(stemmer.stem(word).lower() for word in words)

    sum_counter = sum(counter.values())
    if not sum_counter:
        # No words: the ratio is undefined, report 0 instead of raising.
        return 0

    # Number of distinct stems making up the bottom 10% (never fewer than 1).
    bottom_10 = max(int(round(len(counter) * 0.1)), 1)
    # Only the counts are summed, so which of several equally rare stems is
    # picked does not matter: take the smallest counts directly.
    sum_least_common = sum(sorted(counter.values())[:bottom_10])
    return sum_least_common / sum_counter
def primary_verbs(sentences):
    """Yield primary verb ratios.

    Words are matched, exactly as returned by
    ``f_utils.remove_punctuation``, against ``_VERBS_TOBE``, ``_VERBS_TODO``
    and ``_VERBS_TOHAVE``.

    Args:
        sentences: Iterable of sentences in the form accepted by
            ``f_utils.remove_punctuation``.

    Yields:
        ``(key, count / all primary verbs)`` for each of ``"verb_tobe"``,
        ``"verb_todo"`` and ``"verb_tohave"`` that occurred at least once,
        followed by ``("primary_verbs", all primary verbs / total words)``.
        The latter is 0 when there are no words at all.
    """
    total_words = 0
    counter = Counter()
    for sentence in sentences:
        words = f_utils.remove_punctuation(sentence)
        total_words += len(words)
        for key, forms in (("verb_tobe", _VERBS_TOBE),
                           ("verb_todo", _VERBS_TODO),
                           ("verb_tohave", _VERBS_TOHAVE)):
            hits = sum(1 for word in words if word in forms)
            # Only record keys that actually occurred, mirroring the way
            # Counter addition discards zero counts.
            if hits:
                counter[key] += hits

    sum_counter = sum(counter.values())
    # Plain iteration is portable; ``iterkeys`` is Python-2-only.
    for key in counter:
        yield key, counter[key] / sum_counter
    # Guard the empty-text case instead of raising ZeroDivisionError.
    yield "primary_verbs", (sum_counter / total_words if total_words else 0)