from nlp02_onehot_word import build_vocab, tokenize  # 'tokenize' is assumed to live alongside 'build_vocab'
from nlp02_bow_hand import build_dcount


def build_vocab_df(docs, min_len=2, stopwords=None, tokenizer=tokenize, min_dc=2, max_df=0.5):
    vocab = build_vocab(docs, min_len, stopwords, tokenizer)  # Build the initial vocabulary
    dcount = build_dcount(docs, vocab)                        # Count the document frequency (DF) of each word
    max_dc = max_df * len(docs)                               # Convert the DF ratio to an absolute count
    selected = []
    for idx, (word, _) in enumerate(vocab.items()):
        if min_dc <= dcount[idx] <= max_dc:                   # Keep words within both DF bounds
            selected.append(word)
    vocab = {word: idx for idx, word in enumerate(selected)}  # Re-build the vocabulary with new indices
    return vocab
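
# A minimal usage sketch (not from the original source): with min_dc=2 and
# max_df=1.0, only words appearing in at least two documents survive. The
# exact output depends on build_vocab's tokenization, assumed here to split
# on whitespace and keep words of length >= min_len.
if __name__ == '__main__':
    docs = ['apple banana', 'banana cherry', 'banana date']
    vocab = build_vocab_df(docs, min_dc=2, max_df=1.0)
    print(vocab)  # Expected: {'banana': 0}; only 'banana' appears in >= 2 documents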
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from nlp02_onehot_word import build_vocab
from nlp02_onehot_doc import inverse_vocab
from nlp02_bow_hand import build_dcount

# Load the 20 Newsgroups dataset
remove = ('headers', 'footers', 'quotes')
train = datasets.fetch_20newsgroups(subset='train', remove=remove)

# Build a vocabulary and its document counts
vocab = build_vocab(train.data)
vocab_inv = inverse_vocab(vocab)
dcount = build_dcount(train.data, vocab)

# Print statistics of the vocabulary
print('### Statistics of the vocabulary')
print(f'* The number of documents: {len(train.data)}')
print(f'* The size of the vocabulary: {len(vocab)}')
print(f'* The average number of new words per document: {len(vocab) / len(train.data):.3f}')
print(f'* The range of document counts: ({dcount.min()}, {dcount.max()})')
print(f'* The average document count: {dcount.mean():.3f}')

# Plot a histogram of rare (low document count) words
fig = plt.figure()
dcount10 = dcount[dcount < 10]
plt.hist(dcount10, bins=9, range=(1, 10), align='left')
plt.ylim(0, len(vocab))
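
# For comparison (an addition, not part of the original scripts): scikit-learn's
# CountVectorizer applies the same DF-based pruning via its min_df (an absolute
# count when given an int) and max_df (a proportion when given a float) arguments.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(min_df=2, max_df=0.5)
vectorizer.fit(train.data)
print(f'* The size of the DF-filtered vocabulary (sklearn): {len(vectorizer.vocabulary_)}')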