Example #1
	def prune(self, min_frequency=5):
		'''
		Remove all tokens that have been observed fewer than min_frequency
		times.  Counts for tokens that are removed are attributed to UNK.
		'''
		counts = []
		tokens = []
		for idx, token in enumerate(self.token_map.tokens):

			# Copy over tokens that have at least min_frequency
			# observations. Also copy over UNK no matter what its
			# frequency is.
			if (
				self.counter_sampler.get_frequency(idx) >= min_frequency
				or idx == 0
			):
				tokens.append(token)
				counts.append(self.get_frequency(idx))

			# Skip tokens that have too little frequency.  Attribute their
			# observations to UNK
			else:
				counts[UNK] += self.get_frequency(idx)

		# Create a new TokenMap and CounterFrequency based on the
		# filtered tokens and their counts
		self.token_map = TokenMap(on_unk=self.on_unk, tokens=tokens)
		self.counter_sampler = CounterSampler(counts=counts)
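To make the count-folding concrete, here is a minimal, self-contained sketch of the same pruning logic on plain parallel lists (the tokens, counts, and threshold are illustrative, not taken from the library):

tokens = ['UNK', 'the', 'cat', 'zyzzyva']
counts = [0, 120, 7, 1]
min_frequency = 5

kept_tokens, kept_counts = [], []
for idx, (token, count) in enumerate(zip(tokens, counts)):
	# Keep UNK (index 0) unconditionally, and any token observed at least
	# min_frequency times.
	if idx == 0 or count >= min_frequency:
		kept_tokens.append(token)
		kept_counts.append(count)
	# Otherwise attribute the token's count to UNK, as prune() does.
	else:
		kept_counts[0] += count

print(kept_tokens)  # ['UNK', 'the', 'cat']
print(kept_counts)  # [1, 120, 7]
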
Example #2

    def load(self, loaddir):
        '''
        Load a UnigramDictionary from the specified directory, by
        loading the TokenMap and CounterSampler stored there.  This assumes
        the filenames are 'token-map.gz' and 'counter-sampler.gz'.
        '''
        # Load the TokenMap by delegating to its load function
        self.token_map = TokenMap()
        self.token_map.load(os.path.join(loaddir, 'token-map.gz'))

        # Load the CounterSampler by delegating to its load function
        self.counter_sampler = CounterSampler()
        self.counter_sampler.load(os.path.join(loaddir, 'counter-sampler.gz'))
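A short usage sketch for load(); the directory name below is illustrative and simply needs to contain the two files named in the docstring:

dictionary = UnigramDictionary()
dictionary.load('my-dictionary-dir')
# load() reads these two files from the directory:
#   my-dictionary-dir/token-map.gz
#   my-dictionary-dir/counter-sampler.gz
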
Example #3
	def __init__(self, on_unk=WARN, token_map=None, counter_sampler=None):
		'''
		Create a new UnigramDictionary.  Typical usage provides no
		arguments, but a token_map and counter_sampler can be provided
		to build a UnigramDictionary that comprises them.
		'''
		self.on_unk = on_unk
		self.token_map = token_map
		if token_map is None:
			self.token_map = TokenMap(on_unk=on_unk)

		self.counter_sampler = counter_sampler
		if counter_sampler is None:
			self.counter_sampler = CounterSampler()
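Both construction paths supported by this constructor, sketched briefly (the pre-built components in the second form are illustrative; in practice they might come from a prior load() or prune() step):

# Typical usage: no arguments; a fresh TokenMap and CounterSampler are created.
dictionary = UnigramDictionary()

# Wrapping pre-built components instead (illustrative).
token_map = TokenMap(on_unk=WARN)
counter_sampler = CounterSampler()
assembled = UnigramDictionary(
	on_unk=WARN,
	token_map=token_map,
	counter_sampler=counter_sampler,
)
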
Example #4
	def sort(self):
		unk_count = self.counter_sampler.counts[0]

		# Get the counts and tokens (skipping the first UNK entry)
		# They are parallel arrays (ith count corresponds to ith token)
		counts = self.counter_sampler.counts[1:]
		tokens = self.token_map.tokens[1:]

		# Zip them together and sort by counts (descending).
		# sorted() is used so this also works in Python 3, where zip()
		# returns an iterator with no sort() method.
		token_counts = sorted(zip(counts, tokens), reverse=True)

		# Separate them again
		new_counts = [unk_count]
		new_tokens = ['UNK']
		for count, token in token_counts:
			new_counts.append(count)
			new_tokens.append(token)

		# Rebuild the token_map and counter_sampler on the sorted arrays
		self.token_map = TokenMap(on_unk=self.on_unk, tokens=new_tokens)
		self.counter_sampler = CounterSampler(counts=new_counts)
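The zip-and-sort step in isolation, on illustrative data: because each pair puts the count first, sorted(..., reverse=True) orders tokens by descending frequency (ties fall back to comparing the tokens themselves):

counts = [7, 120, 7, 42]
tokens = ['cat', 'the', 'dog', 'a']

# Pair each count with its token and sort the pairs by count, descending.
pairs = sorted(zip(counts, tokens), reverse=True)
print(pairs)
# [(120, 'the'), (42, 'a'), (7, 'dog'), (7, 'cat')]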