Example No. 1
import os
from collections import Counter
from nltk.corpus import words, gutenberg, brown, reuters, inaugural

# read_pickle, dump_pickle and the only_words regex are helpers defined
# elsewhere in the original project.
def build_word_count():
    # Reuse the cached word-frequency table if it has already been pickled.
    if os.path.isfile('pickled/wcount.pickle'):
        return read_pickle('pickled/wcount.pickle')
    # Count lowercased tokens that match only_words across several NLTK corpora.
    wcount = Counter()
    for corpus in (words, gutenberg, brown, reuters, inaugural):
        for fid in corpus.fileids():
            for word in corpus.words(fid):
                word = word.lower()
                if only_words.match(word) is not None:
                    wcount[word] += 1
    dump_pickle(wcount, 'pickled/wcount.pickle')
    return wcount
Example No. 2
import nltk
from nltk.corpus import words
print(words.fileids())
print(len(words.words('en')))
print(len(words.words('en-basic')))

Example No. 4
most_rated_language = max(language_ratios, key=language_ratios.get)  # key=language_ratios.get makes max() rank the dictionary keys (languages) by their scores, so it returns the language with the highest stop-word overlap.
most_rated_language

test_words_set.intersection(set(stopwords.words(most_rated_language)))  # Show which stop words of the detected language were actually found in the text.
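# language_ratios and test_words_set are built earlier in the original script;
# a minimal sketch of how they are commonly constructed (the helper name
# detect_language_ratios is hypothetical, not from the original) could be:
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords

def detect_language_ratios(text):
    test_words_set = set(wordpunct_tokenize(text.lower()))
    # Score each candidate language by how many of its stop words occur in the text.
    ratios = {language: len(test_words_set & set(stopwords.words(language)))
              for language in stopwords.fileids()}
    return ratios, test_words_set

# Usage: language_ratios, test_words_set = detect_language_ratios(text)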

###################### finding unusual words in a given language
text = "Truly Kryptic is the best puzzle game. It's browser-based and free. Google it."
from nltk import word_tokenize
text_tokenized = word_tokenize(text.lower())
text_tokenized

###importing words
from nltk.corpus import words
words.readme().replace('\n', ' ')
words.fileids()

words.words('en')[:10]
words.words('en-basic')[:10]

##finding unusual words

english_vocab = set(w.lower() for w in words.words())
text_vocab = set(w.lower() for w in text_tokenized if w.isalpha())  # .isalpha() drops punctuation tokens; note it also drops hyphenated tokens such as 'browser-based' (a workaround is sketched below).
unusual = text_vocab.difference(english_vocab)
unusual
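# A possible workaround (a sketch, not part of the original snippet): keep
# hyphenated tokens by filtering with a regex instead of .isalpha(), and treat a
# hyphenated token as known when every hyphen-separated part is in english_vocab.
import re
hyphen_word = re.compile(r'^[a-z]+(?:-[a-z]+)*$')
text_vocab_h = set(w for w in text_tokenized if hyphen_word.match(w))
unusual_h = set(w for w in text_vocab_h
                if not all(part in english_vocab for part in w.split('-')))
unusual_h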


#### creating a POS tagger

## We can train a classifier to work out which suffixes are most informative for POS tagging. We begin by finding out what the most common suffixes are:
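# A sketch of that first step, following the standard NLTK-book approach (not
# part of the original snippet): tally the last 1-3 characters of every word in
# the Brown corpus and keep the 100 most frequent suffixes.
import nltk
from nltk.corpus import brown

suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] += 1
    suffix_fdist[word[-2:]] += 1
    suffix_fdist[word[-3:]] += 1
common_suffixes = [suffix for (suffix, count) in suffix_fdist.most_common(100)]
print(common_suffixes[:10])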
Example No. 5
# import nltk
# from nltk.collocations import BigramCollocationFinder
# from nltk.collocations import TrigramCollocationFinder
# from nltk.metrics import BigramAssocMeasures
# from nltk.metrics import TrigramAssocMeasures
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.corpus import wordnet
import nltk.util
print("NLTK import end")
import logging
# from gensim import corpora, models, similarities

# gensimoutdir = "/XXXprojects/cch/foresight/dat/gensim"

print(words.fileids())
for fid in words.fileids():
    print(fid, len(words.words(fid)))
    print(words.words(fid)[:5], "...", words.words(fid)[-5:])

# for k in wordnet.all_lemma_names():
#     print(k)
all_lemma_list = [k for k in wordnet.all_lemma_names()]
print(len(all_lemma_list))

# for k in wordnet.all_synsets():
#     print(k)
# all_synsets_list = [k for k in wordnet.all_synsets()]
# print(len(all_synsets_list))

Example No. 6
#!/usr/bin/python3
# coding: utf-8
import nltk
from nltk.corpus import words
##################################################################
## Quick look at the corpus
print(type(words))  # <class 'nltk.corpus.reader.wordlist.WordListCorpusReader'>
print(words.fileids())  # ['en', 'en-basic'], just these two
print(words.abspath('en'))  # /home/coder352/nltk_data/corpora/words/en
print(len(words.words('en')))  # 235886 English words
print(type(words.words('en')))  # <class 'list'>
print([word for word in words.words('en') if len(word) == 1])  # the 26 English letters, upper and lower case
print(len(words.words('en-basic')))  # 850 basic English words
##################################################################
## Find possible misspellings in the gutenberg corpus
def unusual_words(text):
    text_vocab = set(w.lower() for w in text if w.isalpha())
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    unusual = text_vocab - english_vocab
    return sorted(unusual)
print(unusual_words(nltk.corpus.gutenberg.words('austen-sense.txt')))
# ['abbeyland', 'abhorred', 'abilities', 'abounded', 'abridgement', 'abused', 'abuses',
# 'accents', 'accepting', 'accommodations', 'accompanied', 'accounted', 'accounts',
# 'accustomary', 'aches', 'acknowledging', 'acknowledgment', 'acknowledgments', ...]
Example No. 7
########## WORDLIST CORPUS READER ###############

# Basic corpus reader for plain word-list files
from nltk.corpus.reader import WordListCorpusReader
# List of a few thousand names organized by gender
from nltk.corpus import names
# List of English words
from nltk.corpus import words

nltkDir="C:\\Users\\Matrix\\AppData\\Roaming\\nltk_data\\corpora\\cookbook\\"
#nltkFile="mywords.txt"
#source=nltkDir+nltkFile

### One-file WordListCorpusReader
reader = WordListCorpusReader(nltkDir, ['wordlist.txt'])
print(reader.words())
print(reader.fileids())

### Multi-file WordListCorpusReader
# To get the names of the files in a corpus, use the "fileids()" method
print(names.fileids())
print(len(names.words('female.txt')))
print(len(names.words('male.txt')))

print(words.fileids())
print(len(words.words('en-basic')))
print(len(words.words('en')))
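# A multi-file WordListCorpusReader can also be built directly; this is a sketch,
# and the file names below are hypothetical plain word lists assumed to exist in nltkDir.
multiReader = WordListCorpusReader(nltkDir, ['wordlist1.txt', 'wordlist2.txt'])
print(multiReader.fileids())
print(multiReader.words('wordlist1.txt'))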

###Chunked Corpus Reader