コード例 #1
0
ファイル: english.py プロジェクト: Brainsciences/metanl
def word_frequency(word, default_freq=0):
    """
    Looks up the word's frequency in a modified version of the Google Books
    1-grams list.

    The characters may be in any case (they'll be case-smashed
    to uppercase) and may include non-ASCII letters in UTF-8 or Unicode.

    Words appear in the list if they meet these criteria, which improve the
    compactness and accuracy of the list:

    - They consist entirely of letters, digits and/or ampersands
    - They contain at least one ASCII letter
    - They appear at least 1000 times in Google Books OR
      (they appear at least 40 times in Google Books and also appear in
      Wiktionary or WordNet)

    Apostrophes are assumed to be at the edge of the word,
    in which case they'll be stripped like they were in the Google data, or
    in the special token "n't" which is treated as "not". This matches the
    output of the tokenize() function.

    If the word is not in the list, `default_freq` is returned instead.

    Raises ValueError if `word` contains a space, because only single
    words can be looked up. This check happens before the word list is
    loaded, so invalid input fails without touching the data file.

    >>> word_frequency('normalization')
    223058.0

    >>> word_frequency('budap', default_freq=100.)
    100.0
    """
    # Validate first: there is no point loading the word list for input
    # that can never be looked up.
    if " " in word:
        raise ValueError("word_frequency can only look up single words, "
                         "but %r contains a space" % word)
    freqs = Wordlist.load('google-unigrams.txt')
    # strip("'") only removes apostrophes at the edges, so the inner
    # apostrophe of the "n't" token survives and can be matched below.
    word = preprocess_text(word.strip("'")).upper()
    if word == "N'T":
        word = 'NOT'
    return freqs.get(word, default_freq)
コード例 #2
0
ファイル: snowball.py プロジェクト: Brainsciences/metanl
    def word_frequency(self, word, default_freq=0):
        """
        Looks up the word's frequency in the Leeds Internet corpus for the
        appropriate language.

        The word is first stemmed with self.snowball_stem(). The stem is
        then stripped of edge apostrophes, preprocessed and lowercased
        before it is looked up in the 'leeds-internet-<lang>.txt' word list,
        where <lang> is self.lang. `default_freq` is passed to the word
        list's get() as the fallback value.

        Raises ValueError if the stemmed word contains a space. The word
        list is only loaded after this check has passed.

        FIXME: this returns 0 for words that stem differently in FreeLing when
        we use FreeLing frequencies, and that's most of the words
        """
        word = self.snowball_stem(word)
        # Validate before loading the word list so that invalid input fails
        # without touching the data file.
        if " " in word:
            raise ValueError("word_frequency can only look up single words, "
                             "but %r contains a space" % word)
        freqs = Wordlist.load('leeds-internet-%s.txt' % self.lang)
        word = preprocess_text(word.strip("'")).lower()
        return freqs.get(word, default_freq)
コード例 #3
0
    def word_frequency(self, word, default_freq=0):
        """
        Looks up the word's frequency in the Leeds Internet corpus for the
        appropriate language.

        Processing order: the word is stemmed with self.snowball_stem(),
        checked for spaces, stripped of edge apostrophes, preprocessed and
        lowercased, and finally looked up in the word list named
        'leeds-internet-<lang>.txt' (with <lang> taken from self.lang).
        `default_freq` is passed to the word list's get() as the fallback
        value.

        Raises ValueError if the stemmed word contains a space; in that case
        the word list is never loaded.

        FIXME: this returns 0 for words that stem differently in FreeLing when
        we use FreeLing frequencies, and that's most of the words
        """
        word = self.snowball_stem(word)
        # Fail fast: reject multi-word input before loading any data.
        if " " in word:
            raise ValueError(
                "word_frequency can only look up single words, "
                "but %r contains a space" % word)
        freqs = Wordlist.load('leeds-internet-%s.txt' % self.lang)
        word = preprocess_text(word.strip("'")).lower()
        return freqs.get(word, default_freq)
コード例 #4
0
ファイル: english.py プロジェクト: tazjel/metanl
def word_frequency(word, default_freq=0):
    """
    Looks up the word's frequency in a modified version of the Google Books
    1-grams list.

    The characters may be in any case (they'll be case-smashed
    to lowercase) and may include non-ASCII letters in UTF-8 or Unicode.

    Words appear in the list if they meet these criteria, which improve the
    compactness and accuracy of the list:

    - They consist entirely of letters, digits and/or ampersands
    - They contain at least one ASCII letter
    - They appear at least 1000 times in Google Books OR
      (they appear at least 40 times in Google Books and also appear in
      Wiktionary or WordNet)

    Apostrophes are assumed to be at the edge of the word,
    in which case they'll be stripped like they were in the Google data, or
    in the special token "n't" which is treated as "not". This matches the
    output of the tokenize() function.

    If the word is not in the list, `default_freq` is returned instead.

    Raises ValueError if `word` contains a space, because only single
    words can be looked up. This check happens before the word list is
    loaded, so invalid input fails without touching the data file.

    >>> word_frequency('normalization')
    223058.0

    >>> word_frequency('budap', default_freq=100.)
    100.0
    """
    # Validate first: there is no point loading the word list for input
    # that can never be looked up.
    if " " in word:
        raise ValueError("word_frequency can only look up single words, "
                         "but %r contains a space" % word)
    freqs = Wordlist.load('google-unigrams.txt')
    # strip("'") only removes apostrophes at the edges, so the inner
    # apostrophe of the "n't" token survives and can be matched below.
    word = preprocess_text(word.strip("'")).lower()
    if word == "n't":
        word = 'not'
    return freqs.get(word, default_freq)
コード例 #5
0
ファイル: japanese.py プロジェクト: tazjel/metanl
def get_wordlist():
    """
    Return the word list that Wordlist.load() produces for the file
    'leeds-internet-ja.txt'.
    """
    filename = 'leeds-internet-ja.txt'
    wordlist = Wordlist.load(filename)
    return wordlist
コード例 #6
0
ファイル: english.py プロジェクト: tazjel/metanl
def get_wordlist():
    """
    Return the word list that Wordlist.load() produces for the file
    'google-unigrams.txt'.
    """
    filename = 'google-unigrams.txt'
    wordlist = Wordlist.load(filename)
    return wordlist
コード例 #7
0
ファイル: japanese.py プロジェクト: Web5design/metanl
def get_wordlist():
    """
    Load 'leeds-internet-ja.txt' through Wordlist.load() and return the
    resulting word list.
    """
    source_file = "leeds-internet-ja.txt"
    loaded = Wordlist.load(source_file)
    return loaded
コード例 #8
0
ファイル: english.py プロジェクト: tazjel/metanl
def get_wordlist():
    """
    Load 'google-unigrams.txt' through Wordlist.load() and return the
    resulting word list.
    """
    source_file = 'google-unigrams.txt'
    loaded = Wordlist.load(source_file)
    return loaded