Example 1
	def prune(self, min_frequency=5):
		'''
		Remove all tokens that have been observed fewer than min_frequency
		times.  Counts for tokens that are removed are attributed to UNK.
		'''
		counts = []
		tokens = []
		for idx, token in enumerate(self.token_map.tokens):

			# Copy over tokens that have at least min_frequency
			# observations.  Also copy over UNK no matter what its
			# frequency is.
			if (
				self.counter_sampler.get_frequency(idx) >= min_frequency
				or idx == 0
			):
				tokens.append(token)
				counts.append(self.get_frequency(idx))

			# Skip tokens that occur too infrequently, attributing their
			# observations to UNK.
			else:
				counts[UNK] += self.get_frequency(idx)

		# Create a new TokenMap and CounterSampler based on the
		# filtered tokens and their counts
		self.token_map = TokenMap(on_unk=self.on_unk, tokens=tokens)
		self.counter_sampler = CounterSampler(counts=counts)
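A minimal usage sketch of prune, assuming the class as defined in the full listing below; the token names and counts are hypothetical:

# Hypothetical usage; assumes UNK occupies index 0, as prune() itself does.
dictionary = UnigramDictionary()
dictionary.update(['apple'] * 10 + ['pear'] * 2 + ['plum'] * 7)

# Drop everything observed fewer than 5 times: 'pear' is removed and its
# two observations are re-attributed to UNK.
dictionary.prune(min_frequency=5)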
Example 2
    def load(self, loaddir):
        '''
        Load a UnigramDictionary from the specified directory, by
        loading the TokenMap and CounterSampler stored there.  This assumes
        the filenames are 'token-map.gz' and 'counter-sampler.gz'.
        '''
        # Load the TokenMap by delegation to its load function
        self.token_map = TokenMap()
        self.token_map.load(os.path.join(loaddir, 'token-map.gz'))

        # Load the CounterSampler by delegation to its load function
        self.counter_sampler = CounterSampler()
        self.counter_sampler.load(os.path.join(loaddir, 'counter-sampler.gz'))
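A hedged sketch of the intended save/load round trip (save is shown in the full listing below); the directory name is hypothetical:

# Hypothetical round trip: save() writes 'token-map.gz' and
# 'counter-sampler.gz' into the directory, and load() reads them back.
dictionary = UnigramDictionary()
dictionary.update(['some', 'example', 'tokens'])
dictionary.save('unigram-dictionary')    # creates the directory if missing

restored = UnigramDictionary()
restored.load('unigram-dictionary')
assert restored.get_vocab_size() == dictionary.get_vocab_size()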
Example 3
	def __init__(self, on_unk=WARN, token_map=None, counter_sampler=None):
		'''
		Create a new UnigramDictionary.  Typical usage provides no
		arguments, but a token_map and counter_sampler can be provided
		to build a UnigramDictionary that comprises them.
		'''
		self.on_unk = on_unk
		self.token_map = token_map
		if token_map is None:
			self.token_map = TokenMap(on_unk=on_unk)

		self.counter_sampler = counter_sampler
		if counter_sampler is None:
			self.counter_sampler = CounterSampler()
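Both construction styles mentioned in the docstring, as a brief sketch (WARN, TokenMap and CounterSampler come from the same module as the class):

# Typical construction: the components are created internally.
dictionary = UnigramDictionary()

# Construction from pre-built parts, composing an existing TokenMap
# and CounterSampler into one dictionary.
dictionary = UnigramDictionary(
	on_unk=WARN,
	token_map=TokenMap(on_unk=WARN),
	counter_sampler=CounterSampler(),
)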
Example 4
	def sort(self):
		'''
		Sort tokens by descending frequency, keeping UNK at index 0 and
		keeping the TokenMap and CounterSampler aligned.
		'''
		unk_count = self.counter_sampler.counts[0]

		# Get the counts and tokens (skipping the first UNK entry)
		# They are parallel arrays (ith count corresponds to ith token)
		counts = self.counter_sampler.counts[1:]
		tokens = self.token_map.tokens[1:]

		# Zip them together and sort by counts, in descending order.  In
		# Python 3, zip() returns an iterator, so materialize it with sorted().
		token_counts = sorted(zip(counts, tokens), reverse=True)

		# Separate them again
		new_counts = [unk_count]
		new_tokens = ['UNK']
		for count, token in token_counts:
			new_counts.append(count)
			new_tokens.append(token)

		# Rebuild the token_map and counter_sampler on the sorted arrays
		self.token_map = TokenMap(on_unk=self.on_unk, tokens=new_tokens)
		self.counter_sampler = CounterSampler(counts=new_counts)
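The core of sort is a descending sort over zipped (count, token) pairs, which keeps the two parallel lists aligned. The same idea in isolation, with made-up values:

counts = [3, 10, 1]
tokens = ['pear', 'apple', 'plum']

# Sort by count, descending, keeping counts and tokens paired up.
pairs = sorted(zip(counts, tokens), reverse=True)
print(pairs)  # [(10, 'apple'), (3, 'pear'), (1, 'plum')]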
Example 5
    def choose_token(self, idx, length):
        '''
        Randomly choose a token according to the kernel supplied
        in the constructor.  Note that when sampling the context near
        the beginning of a sentence, the left part of the context window
        will be truncated.  Similarly, sampling context near the end of
        a sentence leads to truncation of the right part of the context
        window.  Short sentences lead to truncation on both sides.

        To ensure that samples are returned within the possibly truncated
        window, two values define the actual extent of the context to be
        sampled:

        `idx`: index of the query word within the context.  E.g. if the
            valid context is constrained to a sentence, and the query word
            is the 3rd token in the sentence, idx should be 2 (because
            of 0-based indexing)

        `length`: length of the context.  E.g. if the context is
            constrained to a sentence, and the sentence is 7 tokens long,
            length should be 7.
        '''

        # If the token is near the edges of the context, then the
        # sampling kernel will be truncated (we can't sample before the
        # first word in the sentence, or after the last word).
        # Determine the slice indices that define the truncated kernel.
        negative_idx = length - idx
        start = max(0, self.K - idx)
        stop = min(2 * self.K, self.K + negative_idx - 1)

        # We make a separate multinomial sampler for each different
        # truncation of the kernel, because they each define a different
        # set of sampling probabilities.  If we don't have a sampler for
        # this particular kernel shape, make one.
        if (start, stop) not in self.samplers:
            trunc_probabilities = self.kernel[start:stop]
            self.samplers[start, stop] = CounterSampler(trunc_probabilities)

        # Sample from the multinomial sampler for the context of this shape
        outcome_idx = self.samplers[start, stop].sample()

        # Map this into the +/- indexing relative to the query word
        relative_idx = self.indices[outcome_idx + start]

        # And then map this into absolute indexing
        result_idx = relative_idx + idx

        return result_idx
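A worked sketch of the truncation arithmetic above, with illustrative values (K, the kernel and the indices array belong to the sampler object; here they are plain variables):

# Illustrative values only: a half-width-2 kernel over offsets -2, -1, +1, +2.
K = 2
indices = [-2, -1, +1, +2]

idx, length = 1, 5                        # query word is the 2nd of 5 tokens
negative_idx = length - idx               # 4
start = max(0, K - idx)                   # 1: offset -2 falls before the sentence
stop = min(2 * K, K + negative_idx - 1)   # 4: no truncation on the right

# The truncated kernel slice [1:4] permits offsets -1, +1 and +2, i.e.
# absolute positions 0, 2 and 3 around the query word at position 1.
print([indices[i] + idx for i in range(start, stop)])  # [0, 2, 3]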
Example 6
class UnigramDictionary(object):
	'''
	Bundles together a TokenMap and CounterSampler.  Provides a method for
	pruning the vocabulary while keeping the TokenMap and CounterSampler
	in sync with one another.
	'''


	def __init__(self, on_unk=WARN, token_map=None, counter_sampler=None):
		'''
		Create a new UnigramDictionary.  Typical usage provides no
		arguments, but a token_map and counter_sampler can be provided
		to build a UnigramDictionary that comprises them.
		'''
		self.on_unk = on_unk
		self.token_map = token_map
		if token_map is None:
			self.token_map = TokenMap(on_unk=on_unk)

		self.counter_sampler = counter_sampler
		if counter_sampler is None:
			self.counter_sampler = CounterSampler()


	def sort(self):
		'''
		Sort tokens by descending frequency, keeping UNK at index 0 and
		keeping the TokenMap and CounterSampler aligned.
		'''
		unk_count = self.counter_sampler.counts[0]

		# Get the counts and tokens (skipping the first UNK entry)
		# They are parallel arrays (ith count corresponds to ith token)
		counts = self.counter_sampler.counts[1:]
		tokens = self.token_map.tokens[1:]

		# Zip them together and sort by counts, in descending order.  In
		# Python 3, zip() returns an iterator, so materialize it with sorted().
		token_counts = sorted(zip(counts, tokens), reverse=True)

		# Separate them again
		new_counts = [unk_count]
		new_tokens = ['UNK']
		for count, token in token_counts:
			new_counts.append(count)
			new_tokens.append(token)

		# Rebuild the token_map and counter_sampler on the sorted arrays
		self.token_map = TokenMap(on_unk=self.on_unk, tokens=new_tokens)
		self.counter_sampler = CounterSampler(counts=new_counts)


	def remove(self, token):
		'''
		Remove the token, delegating to the underlying TokenMap and
		CounterSampler so the two stay in sync.
		'''
		idx = self.get_id(token)
		self.token_map.remove(token)
		self.counter_sampler.remove(idx)


	def compact(self):
		self.token_map.compact()
		self.counter_sampler.compact()


	def prune(self, min_frequency=5):
		'''
		Remove all tokens that have been observed fewer than min_frequency
		times.  Counts for tokens that are removed are attributed to UNK.
		'''
		counts = []
		tokens = []
		for idx, token in enumerate(self.token_map.tokens):

			# Copy over tokens that have at least min_frequency
			# observations.  Also copy over UNK no matter what its
			# frequency is.
			if (
				self.counter_sampler.get_frequency(idx) >= min_frequency
				or idx == 0
			):
				tokens.append(token)
				counts.append(self.get_frequency(idx))

			# Skip tokens that occur too infrequently, attributing their
			# observations to UNK.
			else:
				counts[UNK] += self.get_frequency(idx)

		# Create a new TokenMap and CounterSampler based on the
		# filtered tokens and their counts
		self.token_map = TokenMap(on_unk=self.on_unk, tokens=tokens)
		self.counter_sampler = CounterSampler(counts=counts)


	def add(self, token):
		'''
		Add a new token.  If this "token type" (which means this specific
		spelling of a word) has not been seen before, add it to the
		mapping.  Also increment the count for that token type.  Return
		its ID under the token mapping.
		'''

		# Get or create an id for this token
		token_id = self.token_map.add(token)

		# Increment the frequency count
		self.counter_sampler.add(token_id)

		return token_id


	def get_vocab_size(self):
		'''
		Return the number of unique tokens in the token_map.
		'''
		return len(self.token_map)


	def get_num_tokens(self):
		'''
		Return the total number of (non-distinct) tokens observed.
		'''
		return len(self.counter_sampler)


	def __len__(self):
		'''
		Same as get_vocab_size().
		Return the number of unique tokens in the token_map.
		'''
		return len(self.token_map)


	def update(self, token_iterable):
		'''
		Add every token yielded by token_iterable, returning the list
		of their ids.
		'''
		return [self.add(token) for token in token_iterable]


	def get_id(self, token):
		'''
		Get the id (int) for the corresponding token (string).
		'''
		# Delegate to the underlying token_map.
		return self.token_map.get_id(token)


	def get_ids(self, token_iterable):
		'''
		Get the ids (list of ints) for the corresponding tokens (strings)
		issued by token_iterable.
		'''
		# Delegate to the underlying token map.
		return self.token_map.get_ids(token_iterable)


	def get_token(self, idx):
		'''
		Return token (string) for the corresponding id (int)
		'''
		# Delegate to the underlying token map
		return self.token_map.get_token(idx)


	def get_tokens(self, idx_iterable):
		'''
		Return the tokens (list of strings) for the corresponding ids
		(ints) issued by idx_iterable.
		'''
		# Delegate to the underlying token map.
		return self.token_map.get_tokens(idx_iterable)


	def save(self, savedir):
		'''
		Save the UnigramDictionary to the directory specified.  This saves
		the underlying TokenMap and CounterSampler in the directory
		given (savedir), using the default filenames "token-map.gz" and
		"counter-sampler.gz".
		'''

		# If the directory provided is a file, raise an error
		if os.path.exists(savedir):
			if os.path.isfile(savedir):
				raise IOError(
					'Directory specified for saving UnigramDictionary is a '
					'file.'
				)

		# If the directory provided doesn't exist, make it (this will not
		# make parent directories though).
		else:
			os.mkdir(savedir)


		# Save the TokenMap and CounterSampler by delegating to their
		# save functions.
		self.token_map.save(os.path.join(savedir, 'token-map.gz'))
		self.counter_sampler.save(os.path.join(
			savedir, 'counter-sampler.gz'
		))


	def load(self, loaddir):
		'''
		Load a UnigramDictionary from the specified directory, by
		loading the TokenMap and CounterSampler stored there.  This assumes
		the filenames are 'token-map.gz' and 'counter-sampler.gz'.
		'''
		# Load the TokenMap by delegation to its load function
		self.token_map = TokenMap()
		self.token_map.load(os.path.join(loaddir, 'token-map.gz'))

		# Load the CounterSampler by delegation to its load function
		self.counter_sampler = CounterSampler()
		self.counter_sampler.load(
			os.path.join(loaddir, 'counter-sampler.gz'))


	def sample(self, shape=None):
		'''
		Draw a sample according to the counter_sampler probabilities.
		'''
		# Delegate to the underlying CounterSampler
		return self.counter_sampler.sample(shape)


	def get_probability(self, token_id):
		'''
		Return the probability associated with token_id.
		'''
		# Delegate to the underlying CounterSampler
		return self.counter_sampler.get_probability(token_id)


	def get_frequency(self, token_id):
		'''
		Return the frequency associated with token_id.
		'''
		# Delegate to the underlying CounterSampler
		return self.counter_sampler.get_frequency(token_id)
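Finally, a hedged end-to-end sketch of how UnigramDictionary might be used over a tiny corpus (the sentences are hypothetical, and sample() is assumed to return a single token id when called with the default shape):

# Hypothetical corpus pass: build the vocabulary, prune it, sort it,
# then query probabilities and draw a token.
dictionary = UnigramDictionary(on_unk=WARN)
for sentence in [['the', 'cat', 'sat'], ['the', 'dog', 'sat']]:
	dictionary.update(sentence)

dictionary.prune(min_frequency=2)   # keep 'the' and 'sat'; 'cat'/'dog' fold into UNK
dictionary.sort()                   # order tokens by descending frequency

the_id = dictionary.get_id('the')
print(dictionary.get_frequency(the_id))       # 2
print(dictionary.get_probability(the_id))     # relative frequency of 'the'
print(dictionary.get_token(dictionary.sample()))  # random token ~ unigram counts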