def prune(self, min_frequency=5):
    '''
    Remove all tokens that have been observed fewer than min_frequency
    times.  Counts for tokens that are removed are attributed to UNK.
    '''
    counts = []
    tokens = []
    for idx, token in enumerate(self.token_map.tokens):

        # Copy over tokens that have at least min_frequency
        # observations.  Also copy over UNK no matter what its
        # frequency.
        if (
            self.counter_sampler.get_frequency(idx) >= min_frequency
            or idx == UNK
        ):
            tokens.append(token)
            counts.append(self.get_frequency(idx))

        # Skip tokens whose frequency is too low, attributing their
        # observations to UNK.
        else:
            counts[UNK] += self.get_frequency(idx)

    # Create a new TokenMap and CounterSampler based on the
    # filtered tokens and their counts.
    self.token_map = TokenMap(on_unk=self.on_unk, tokens=tokens)
    self.counter_sampler = CounterSampler(counts=counts)
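# A minimal usage sketch for prune().  Hypothetical setup: it assumes token
# counts have already been accumulated (e.g. by an add/update method defined
# elsewhere in this class, not shown here):
#
#     dictionary = UnigramDictionary()
#     # ... accumulate token observations ...
#     dictionary.prune(min_frequency=10)
#     # Tokens seen fewer than 10 times are removed, and their counts
#     # are folded into the count for UNK (index 0).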
def load(self, loaddir):
    '''
    Load a UnigramDictionary from the specified directory, by loading
    the TokenMap and CounterSampler stored there.  This assumes the
    filenames are 'token-map.gz' and 'counter-sampler.gz'.
    '''
    # Load the TokenMap by delegating to its load function.
    self.token_map = TokenMap()
    self.token_map.load(os.path.join(loaddir, 'token-map.gz'))

    # Load the CounterSampler by delegating to its load function.
    self.counter_sampler = CounterSampler()
    self.counter_sampler.load(
        os.path.join(loaddir, 'counter-sampler.gz'))
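# A usage sketch for load().  'my-dictionary' is a placeholder directory
# that must contain 'token-map.gz' and 'counter-sampler.gz' (e.g. as written
# by a corresponding save method, assumed to exist elsewhere in this class):
#
#     dictionary = UnigramDictionary()
#     dictionary.load('my-dictionary')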
def __init__(self, on_unk=WARN, token_map=None, counter_sampler=None):
    '''
    Create a new UnigramDictionary.  Typical usage provides no
    arguments, but a token_map and counter_sampler can be provided
    to build a UnigramDictionary that comprises them.
    '''
    self.on_unk = on_unk

    self.token_map = token_map
    if token_map is None:
        self.token_map = TokenMap(on_unk=on_unk)

    self.counter_sampler = counter_sampler
    if counter_sampler is None:
        self.counter_sampler = CounterSampler()
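# Construction sketches.  The zero-argument form is the typical case; the
# second form wraps prebuilt components (existing_token_map and
# existing_counter_sampler are assumed to already exist):
#
#     dictionary = UnigramDictionary()
#     dictionary = UnigramDictionary(
#         token_map=existing_token_map,
#         counter_sampler=existing_counter_sampler,
#     )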
def sort(self):
    '''
    Reorder tokens by decreasing frequency, keeping UNK at index 0.
    '''
    unk_count = self.counter_sampler.counts[0]

    # Get the counts and tokens (skipping the first UNK entry).
    # They are parallel arrays (the ith count corresponds to the ith
    # token).
    counts = self.counter_sampler.counts[1:]
    tokens = self.token_map.tokens[1:]

    # Zip them together and sort by counts, most frequent first.
    # (In Python 3, zip() returns an iterator with no .sort() method,
    # so materialize and sort the pairs with sorted() instead.)
    token_counts = sorted(zip(counts, tokens), reverse=True)

    # Separate them again, restoring UNK at index 0.
    new_counts = [unk_count]
    new_tokens = ['UNK']
    for count, token in token_counts:
        new_counts.append(count)
        new_tokens.append(token)

    # Rebuild the token_map and counter_sampler on the sorted arrays.
    self.token_map = TokenMap(on_unk=self.on_unk, tokens=new_tokens)
    self.counter_sampler = CounterSampler(counts=new_counts)
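# A sketch of sort()'s effect: token ids are reassigned in order of
# decreasing frequency, with UNK pinned at index 0 regardless of its count:
#
#     dictionary.sort()
#     # dictionary.token_map.tokens[1] is now the most frequent real token.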