def build_word_count():
    """Build (or load from cache) a Counter of lowercase word frequencies.

    Tallies every token matching the module-level ``only_words`` regex
    across five NLTK corpora (words, gutenberg, brown, reuters,
    inaugural).  The result is pickled to ``pickled/wcount.pickle`` so
    subsequent calls return instantly from the cache.

    Returns:
        collections.Counter mapping lowercase word -> occurrence count.
    """
    cache_path = 'pickled/wcount.pickle'
    if os.path.isfile(cache_path):
        return read_pickle(cache_path)
    wcount = Counter()
    # The original repeated an identical counting loop once per corpus;
    # iterate over the corpus reader objects instead (same order, same
    # counts, one copy of the logic).
    for corpus in (words, gutenberg, brown, reuters, inaugural):
        for fid in corpus.fileids():
            for word in corpus.words(fid):
                word = word.lower()
                if only_words.match(word) is not None:
                    wcount[word] += 1
    dump_pickle(wcount, cache_path)
    return wcount
# Sanity check for the NLTK "words" wordlist corpus: list the available
# file ids, then report the size of each wordlist.
import nltk
from nltk.corpus import words

print(words.fileids())
for wordlist_id in ('en', 'en-basic'):
    print(len(words.words(wordlist_id)))
most_rated_language = max(language_ratios, key=language_ratios.get) # The key parameter to the max() function is a function that computes a key. In our case, we already have a key so we set key to languages_ratios.get which actually returns the key. most_rated_language test_words_set.intersection(set(stopwords.words(most_rated_language))) # We can see which English stop words were found. ######################fiding unusual words in given language text = "Truly Kryptic is the best puzzle game. It's browser-based and free. Google it." from nltk import word_tokenize text_tokenized = word_tokenize(text.lower()) text_tokenized ###importing words from nltk.corpus import words words.readme().replace('\n', ' ') words.fileids() words.words('en')[:10] words.words('en-basic')[:10] ##finding unusual words english_vocab = set(w.lower() for w in words.words()) text_vocab = set(w.lower() for w in text_tokenized if w.isalpha()) # Note .isalpha() removes punctuation tokens. However, tokens with a hyphen like 'browser-based' are totally skipped over because .isalpha() would be false. unusual = text_vocab.difference(english_vocab) unusual ####creating POS tagger ##We can train a classifier to work out which suffixes are most informative for POS tagging. We can begin by finding out what the most common suffixes are
# Exploration script: inspect the NLTK wordlist files and count WordNet
# lemma names.  Collocation/gensim machinery is left disabled below.
# import nltk
# from nltk.collocations import BigramCollocationFinder
# from nltk.collocations import TrigramCollocationFinder
# from nltk.metrics import BigramAssocMeasures
# from nltk.metrics import TrigramAssocMeasures
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.corpus import wordnet
import nltk.util

print("NLTK import end")

import logging

# from gensim import corpora, models, similarities
# gensimoutdir = "/XXXprojects/cch/foresight/dat/gensim"

# For each wordlist file: id, size, and a peek at both ends of the list.
print(words.fileids())
for fid in words.fileids():
    wordlist = words.words(fid)
    print(fid, len(wordlist))
    print(wordlist[:5], "...", wordlist[-5:])

# for k in wordnet.all_lemma_names():
#     print(k)
all_lemma_list = list(wordnet.all_lemma_names())
print(len(all_lemma_list))

# for k in wordnet.all_synsets():
#     print(k)
# all_synsets_list = list(wordnet.all_synsets())
# print(len(all_synsets_list))
#!/usr/bin/python3
# coding: utf-8
import nltk  # BUG FIX: unusual_words() and the final call reference nltk.corpus.*; without this the script raised NameError
from nltk.corpus import words

##################################################################
## Quick look at the "words" corpus
print(type(words))  # <class 'nltk.corpus.reader.wordlist.WordListCorpusReader'>
print(words.fileids())  # ['en', 'en-basic'] -- just the two
print(words.abspath('en'))  # e.g. /home/coder352/nltk_data/corpora/words/en
print(len(words.words('en')))  # 235886 English words
print(type(words.words('en')))  # <class 'list'>
print([word for word in words.words('en') if len(word) == 1])  # the 26 letters, upper and lower case
print(len(words.words('en-basic')))  # 2850 basic English words

##################################################################
## Spot possible misspellings in a Gutenberg text
def unusual_words(text):
    """Return the sorted words of *text* that are absent from the NLTK English wordlist."""
    text_vocab = set(w.lower() for w in text if w.isalpha())
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    unusual = text_vocab - english_vocab
    return sorted(unusual)

print(unusual_words(nltk.corpus.gutenberg.words('austen-sense.txt')))
# ['abbeyland', 'abhorred', 'abilities', 'abounded', 'abridgement', 'abused', 'abuses',
# 'accents', 'accepting', 'accommodations', 'accompanied', 'accounted', 'accounts',
# 'accustomary', 'aches', 'acknowledging', 'acknowledgment', 'acknowledgments', ...]
########## WORDLIST CORPUS READER ###############
# Basic corpus reader for plain one-word-per-line files.
from nltk.corpus.reader import WordListCorpusReader
# A few thousand first names organized by gender (female.txt / male.txt).
from nltk.corpus import names
# List of English words.
from nltk.corpus import words

nltkDir = "C:\\Users\\Matrix\\AppData\\Roaming\\nltk_data\\corpora\\cookbook\\"
# nltkFile = "mywords.txt"
# source = nltkDir + nltkFile

### One File WordListCorpusReader
reader = WordListCorpusReader(nltkDir, ['wordlist.txt'])
# Converted from Python 2 print statements to print() calls for
# consistency with the rest of the file (which is Python 3).
print(reader.words())
print(reader.fileids())

### MultiFile WordListCorpusReader
# To get the names of the files in the corpus use the "fileids" command.
names.fileids()
# BUG FIX: the original printed the female.txt count twice; the second
# call should count the male names.
print(len(names.words('female.txt')))
print(len(names.words('male.txt')))

words.fileids()
print(len(words.words('en-basic')))
print(len(words.words('en')))

### Chunked Corpus Reader