def word_frequency(word, default_freq=0):
    """
    Look up a word's frequency in a modified version of the Google Books
    1-grams list.

    The characters may be in any case (they'll be case-smashed to uppercase)
    and may include non-ASCII letters in UTF-8 or Unicode.

    Words appear in the list if they meet these criteria, which improve the
    compactness and accuracy of the list:

    - They consist entirely of letters, digits and/or ampersands
    - They contain at least one ASCII letter
    - They appear at least 1000 times in Google Books OR
      (they appear at least 40 times in Google Books and also appear in
      Wiktionary or WordNet)

    Apostrophes are assumed to be at the edge of the word, in which case
    they'll be stripped like they were in the Google data, or in the special
    token "n't" which is treated as "not". This matches the output of the
    tokenize() function.

    `default_freq` is the value returned when the word is not in the list.

    Raises ValueError if `word` contains a space, because only single words
    can be looked up.

    >>> word_frequency('normalization')
    223058.0
    >>> word_frequency('budap', default_freq=100.)
    100.0
    """
    # Validate first so that a multi-word argument is rejected before
    # Wordlist.load is called at all.
    if " " in word:
        raise ValueError(
            "word_frequency can only look up single words, "
            "but %r contains a space" % word
        )

    freqs = Wordlist.load('google-unigrams.txt')

    # Only apostrophes at the edges are removed; the one inside "n't"
    # survives strip() and is handled by the special case below.
    word = preprocess_text(word.strip("'")).upper()
    if word == "N'T":
        word = 'NOT'
    return freqs.get(word, default_freq)
def word_frequency(self, word, default_freq=0):
    """
    Look up the word's frequency in the Leeds Internet corpus for the
    appropriate language (the list file is chosen by `self.lang`).

    The word is stemmed with `self.snowball_stem`, stripped of edge
    apostrophes, preprocessed and lowercased before the lookup.
    `default_freq` is returned when the resulting key is not in the list.

    Raises ValueError if the stemmed word contains a space, because only
    single words can be looked up.

    FIXME: this returns 0 for words that stem differently in FreeLing
    when we use FreeLing frequencies, and that's most of the words
    """
    word = self.snowball_stem(word)

    # Reject multi-word input before calling Wordlist.load, so an invalid
    # call does not load the frequency list first.
    if " " in word:
        raise ValueError(
            "word_frequency can only look up single words, "
            "but %r contains a space" % word
        )

    freqs = Wordlist.load('leeds-internet-%s.txt' % self.lang)
    word = preprocess_text(word.strip("'")).lower()
    return freqs.get(word, default_freq)
def word_frequency(self, word, default_freq=0):
    """
    Look up the word's frequency in the Leeds Internet corpus for the
    appropriate language (the list file is chosen by `self.lang`).

    Processing order: the word is stemmed with `self.snowball_stem`,
    checked for spaces, then stripped of edge apostrophes, preprocessed
    and lowercased to form the lookup key. `default_freq` is returned
    when that key is not in the list.

    Raises ValueError if the stemmed word contains a space, because only
    single words can be looked up.

    FIXME: this returns 0 for words that stem differently in FreeLing
    when we use FreeLing frequencies, and that's most of the words
    """
    stemmed = self.snowball_stem(word)

    # The space check runs before Wordlist.load so that invalid input is
    # rejected without loading the frequency list.
    if " " in stemmed:
        raise ValueError(
            "word_frequency can only look up single words, "
            "but %r contains a space" % stemmed
        )

    freqs = Wordlist.load('leeds-internet-%s.txt' % self.lang)
    key = preprocess_text(stemmed.strip("'")).lower()
    return freqs.get(key, default_freq)
def word_frequency(word, default_freq=0):
    """
    Look up a word's frequency in a modified version of the Google Books
    1-grams list.

    The characters may be in any case (they'll be case-smashed to lowercase)
    and may include non-ASCII letters in UTF-8 or Unicode.

    Words appear in the list if they meet these criteria, which improve the
    compactness and accuracy of the list:

    - They consist entirely of letters, digits and/or ampersands
    - They contain at least one ASCII letter
    - They appear at least 1000 times in Google Books OR
      (they appear at least 40 times in Google Books and also appear in
      Wiktionary or WordNet)

    Apostrophes are assumed to be at the edge of the word, in which case
    they'll be stripped like they were in the Google data, or in the special
    token "n't" which is treated as "not". This matches the output of the
    tokenize() function.

    `default_freq` is the value returned when the word is not in the list.

    Raises ValueError if `word` contains a space, because only single words
    can be looked up.

    >>> word_frequency('normalization')
    223058.0
    >>> word_frequency('budap', default_freq=100.)
    100.0
    """
    # Validate first so that a multi-word argument is rejected before
    # Wordlist.load is called at all.
    if " " in word:
        raise ValueError(
            "word_frequency can only look up single words, "
            "but %r contains a space" % word
        )

    freqs = Wordlist.load('google-unigrams.txt')

    # Only apostrophes at the edges are removed; the one inside "n't"
    # survives strip() and is handled by the special case below.
    word = preprocess_text(word.strip("'")).lower()
    if word == "n't":
        word = 'not'
    return freqs.get(word, default_freq)
def get_wordlist():
    """
    Return whatever Wordlist.load produces for 'leeds-internet-ja.txt'.

    NOTE(review): the filename suggests the Japanese ("ja") Leeds Internet
    frequency list -- confirm against the data files.
    """
    filename = 'leeds-internet-ja.txt'
    return Wordlist.load(filename)
def get_wordlist():
    """
    Return whatever Wordlist.load produces for 'google-unigrams.txt'.

    NOTE(review): the filename suggests the Google Books 1-grams frequency
    list -- confirm against the data files.
    """
    filename = 'google-unigrams.txt'
    return Wordlist.load(filename)
def get_wordlist():
    """
    Return whatever Wordlist.load produces for "leeds-internet-ja.txt".

    NOTE(review): the filename suggests the Japanese ("ja") Leeds Internet
    frequency list -- confirm against the data files.
    """
    filename = "leeds-internet-ja.txt"
    return Wordlist.load(filename)