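import os

# TokenMap, CounterSampler, WARN, and UNK are assumed to be defined in
# companion modules of this package; their import statements are not part of
# this excerpt.

# choose_token() below references self.K, self.kernel, self.samplers, and
# self.indices, so it is assumed to belong to a kernel-based context-sampler
# class (not shown here) rather than to the UnigramDictionary class defined
# further down.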
def choose_token(self, idx, length):
    '''
    Randomly choose a token according to the kernel supplied in the
    constructor.  Note that when sampling the context near the
    beginning of a sentence, the left part of the context window will
    be truncated.  Similarly, sampling context near the end of a
    sentence leads to truncation of the right part of the context
    window.  Short sentences lead to truncation on both sides.

    To ensure that samples are returned within the possibly truncated
    window, two values define the actual extent of the context to be
    sampled:

    `idx`: index of the query word within the context.  E.g. if the
        valid context is constrained to a sentence, and the query word
        is the 3rd token in the sentence, idx should be 2 (because of
        0-based indexing).

    `length`: length of the context.  E.g. if the context is
        constrained to a sentence, and the sentence is 7 tokens long,
        length should be 7.
    '''
    # If the token is near the edges of the context, then the sampling
    # kernel will be truncated (we can't sample before the first word
    # in the sentence, or after the last word).  Determine the slice
    # indices that define the truncated kernel.
    negative_idx = length - idx
    start = max(0, self.K - idx)
    stop = min(2 * self.K, self.K + negative_idx - 1)

    # We make a separate multinomial sampler for each different
    # truncation of the kernel, because they each define a different
    # set of sampling probabilities.  If we don't have a sampler for
    # this particular kernel shape, make one.
    if (start, stop) not in self.samplers:
        trunc_probabilities = self.kernel[start:stop]
        self.samplers[start, stop] = CounterSampler(trunc_probabilities)

    # Sample from the multinomial sampler for the context of this shape
    outcome_idx = self.samplers[start, stop].sample()

    # Map this into the +/- indexing relative to the query word
    relative_idx = self.indices[outcome_idx + start]

    # And then map this into absolute indexing
    result_idx = relative_idx + idx

    return result_idx
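# Worked example of the truncation arithmetic above (a sketch only; it assumes
# K == 2 and indices == [-2, -1, 1, 2], which are not shown in this excerpt):
#
#   * Query word mid-sentence: idx = 3, length = 7.
#     negative_idx = 4, start = max(0, 2 - 3) = 0, stop = min(4, 2 + 4 - 1) = 4,
#     so the full kernel[0:4] is used and any offset in {-2, -1, +1, +2} can
#     be sampled.
#
#   * Query word at the start of the sentence: idx = 0, length = 7.
#     negative_idx = 7, start = max(0, 2 - 0) = 2, stop = min(4, 2 + 7 - 1) = 4,
#     so only kernel[2:4] is used and only the right-hand offsets {+1, +2}
#     can be sampled.
#
#   * Query word at the end of the sentence: idx = 6, length = 7.
#     negative_idx = 1, start = max(0, 2 - 6) = 0, stop = min(4, 2 + 1 - 1) = 2,
#     so only kernel[0:2] is used and only the left-hand offsets {-2, -1}
#     can be sampled.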
class UnigramDictionary(object):
    '''
    Bundles together a TokenMap and CounterSampler.  Provides a method
    for pruning the vocabulary while keeping the TokenMap and
    CounterSampler in sync with one another.
    '''

    def __init__(self, on_unk=WARN, token_map=None, counter_sampler=None):
        '''
        Create a new UnigramDictionary.  Typical usage provides no
        arguments, but a token_map and counter_sampler can be provided
        to build a UnigramDictionary that comprises them.
        '''
        self.on_unk = on_unk
        self.token_map = token_map
        if token_map is None:
            self.token_map = TokenMap(on_unk=on_unk)

        self.counter_sampler = counter_sampler
        if counter_sampler is None:
            self.counter_sampler = CounterSampler()

    def sort(self):
        '''
        Reorder tokens (except UNK, which stays at index 0) in
        descending order of frequency, rebuilding the TokenMap and
        CounterSampler so they remain in sync.
        '''
        unk_count = self.counter_sampler.counts[0]

        # Get the counts and tokens (skipping the first UNK entry).
        # They are parallel arrays (ith count corresponds to ith token).
        counts = self.counter_sampler.counts[1:]
        tokens = self.token_map.tokens[1:]

        # Zip them together and sort by counts, descending
        token_counts = sorted(zip(counts, tokens), reverse=True)

        # Separate them again
        new_counts = [unk_count]
        new_tokens = ['UNK']
        for count, token in token_counts:
            new_counts.append(count)
            new_tokens.append(token)

        # Rebuild the token_map and counter_sampler on the sorted arrays
        self.token_map = TokenMap(on_unk=self.on_unk, tokens=new_tokens)
        self.counter_sampler = CounterSampler(counts=new_counts)

    def remove(self, token):
        '''
        Remove a single token from the TokenMap and remove its count
        entry (by id) from the CounterSampler.
        '''
        idx = self.get_id(token)
        self.token_map.remove(token)
        self.counter_sampler.remove(idx)

    def compact(self):
        '''
        Compact the underlying TokenMap and CounterSampler.
        '''
        self.token_map.compact()
        self.counter_sampler.compact()

    def prune(self, min_frequency=5):
        '''
        Remove all tokens that have been observed fewer than
        min_frequency times.  Counts for tokens that are removed are
        attributed to UNK.
        '''
        counts = []
        tokens = []
        for idx, token in enumerate(self.token_map.tokens):

            # Copy over tokens that have at least min_frequency
            # observations.  Also copy over UNK no matter what its
            # frequency is.
            if (
                self.counter_sampler.get_frequency(idx) >= min_frequency
                or idx == 0
            ):
                tokens.append(token)
                counts.append(self.get_frequency(idx))

            # Skip tokens that have too little frequency.  Attribute
            # their observations to UNK.
            else:
                counts[UNK] += self.get_frequency(idx)

        # Create a new TokenMap and CounterSampler based on the
        # filtered tokens and their counts
        self.token_map = TokenMap(on_unk=self.on_unk, tokens=tokens)
        self.counter_sampler = CounterSampler(counts=counts)

    def add(self, token):
        '''
        Add a new token.  If this "token type" (which means this
        specific spelling of a word) has not been seen before, add it
        to the mapping.  Also increment the count for that token type.
        Return its ID under the token mapping.
        '''
        # Get or create an id for this token
        token_id = self.token_map.add(token)

        # Increment the frequency count
        self.counter_sampler.add(token_id)

        return token_id

    def get_vocab_size(self):
        '''
        Return the number of unique tokens in the token_map.
        '''
        return len(self.token_map)

    def get_num_tokens(self):
        '''
        Return the total number of (non-distinct) tokens observed.
        '''
        return len(self.counter_sampler)

    def __len__(self):
        '''
        Same as get_vocab_size().  Return the number of unique tokens
        in the token_map.
        '''
        return len(self.token_map)

    def update(self, token_iterable):
        '''
        Add every token issued by token_iterable, returning the list of
        their ids.
        '''
        return [self.add(token) for token in token_iterable]

    def get_id(self, token):
        '''
        Get the id (int) for the corresponding token (string).
        '''
        # Delegate to the underlying token_map.
        return self.token_map.get_id(token)

    def get_ids(self, token_iterable):
        '''
        Get the ids (list of ints) for the corresponding tokens
        (strings) issued by token_iterable.
        '''
        # Delegate to the underlying token map.
        return self.token_map.get_ids(token_iterable)

    def get_token(self, idx):
        '''
        Return the token (string) for the corresponding id (int).
        '''
        # Delegate to the underlying token map.
        return self.token_map.get_token(idx)

    def get_tokens(self, idx_iterable):
        '''
        Return the tokens (list of strings) for the corresponding ids
        (ints) issued by idx_iterable.
        '''
        # Delegate to the underlying token map.
        return self.token_map.get_tokens(idx_iterable)

    def save(self, savedir):
        '''
        Save the UnigramDictionary to the directory specified.  This
        saves the underlying TokenMap and CounterSampler in the
        directory given (savedir), using the default filenames
        "token-map.gz" and "counter-sampler.gz".
        '''
        # If the path provided is an existing file, raise an error
        if os.path.exists(savedir):
            if os.path.isfile(savedir):
                raise IOError(
                    'Directory specified for saving UnigramDictionary '
                    'is a file.'
                )

        # If the directory provided doesn't exist, make it (this will
        # not make parent directories though).
        else:
            os.mkdir(savedir)

        # Save the TokenMap and CounterSampler by delegating to their
        # save functions.
        self.token_map.save(os.path.join(savedir, 'token-map.gz'))
        self.counter_sampler.save(os.path.join(
            savedir, 'counter-sampler.gz'
        ))

    def load(self, loaddir):
        '''
        Load a UnigramDictionary from the specified directory, by
        loading the TokenMap and CounterSampler stored there.  This
        assumes the filenames are 'token-map.gz' and
        'counter-sampler.gz'.
        '''
        # Load the TokenMap by delegation to its load function
        self.token_map = TokenMap()
        self.token_map.load(os.path.join(loaddir, 'token-map.gz'))

        # Load the CounterSampler by delegation to its load function
        self.counter_sampler = CounterSampler()
        self.counter_sampler.load(
            os.path.join(loaddir, 'counter-sampler.gz'))

    def sample(self, shape=None):
        '''
        Draw a sample according to the counter_sampler probability
        distribution.
        '''
        # Delegate to the underlying CounterSampler
        return self.counter_sampler.sample(shape)

    def get_probability(self, token_id):
        '''
        Return the probability associated with token_id.
        '''
        # Delegate to the underlying CounterSampler
        return self.counter_sampler.get_probability(token_id)

    def get_frequency(self, token_id):
        '''
        Return the frequency associated with token_id.
        '''
        # Delegate to the underlying CounterSampler
        return self.counter_sampler.get_frequency(token_id)
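# A minimal usage sketch (hypothetical tokens and directory name, not part of
# the original module; it assumes the companion TokenMap and CounterSampler
# classes behave as delegated to above):
#
#     dictionary = UnigramDictionary()
#     dictionary.update(['the', 'cat', 'sat', 'on', 'the', 'mat'])
#     dictionary.update(['the', 'dog', 'sat'])
#
#     dictionary.prune(min_frequency=2)   # rare tokens are folded into UNK
#     dictionary.sort()                   # most frequent tokens get low ids
#
#     the_id = dictionary.get_id('the')                    # token -> id
#     the_probability = dictionary.get_probability(the_id)
#     sampled_id = dictionary.sample()    # draw an id from the unigram counts
#
#     dictionary.save('unigram-dictionary')   # writes token-map.gz and
#                                             # counter-sampler.gz in that dir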