# NOTE(review): this is a stray module-level copy of MarkovChain.__init__
# (the identical method appears inside the class defined below). Defined
# outside any class it is never used as a constructor -- it looks like a
# paste/merge artifact. Confirm nothing imports it, then delete.
def __init__(self, markov_dict, priority_list=None, not_found_list=None, neighbor_dict=None):
		self.markov_dict = markov_dict
		self.gtype = self.markov_dict['gtype']
		self.stop_words = set(stopwords.words('english'))
		self.neighbor_dict = neighbor_dict
		self.tokenizer = WhitespaceTokenizer()
		self.word_list = self.tokenizer.tokenize(self.markov_dict['corpus_txt'])
		self.lower_word_list = [w.lower() for w in self.word_list]
		# Count of word freq, maintaining case
		self.word_dict_count = Counter(self.word_list)
		self.truecaser = TrueCase(self.markov_dict['fname'])

		# Create priority and not_found_list if none were entered
		# (note: an explicitly-passed empty list is treated as "none")
		if priority_list:
			self.priority_list = priority_list
		else:
			self._make_priority()
		if not_found_list:
			self.not_found_list = not_found_list
		else:
			self._make_not_found()
class MarkovChain(object):
	'''Create a MarkovChain from the given dictionary and parameters,
	run() returns a sentence given a seed

	markov_dict should be a MarkovDict().api dictionary'''

	def __init__(self, markov_dict, priority_list=None, not_found_list=None, neighbor_dict=None):
		'''Set up the chain.

		markov_dict    -- MarkovDict().api dict; this code reads its
		                  'gtype', 'corpus_txt', 'fname', 'gram_size',
		                  'f_dict' and 'b_dict' keys
		priority_list  -- optional preferred seed words; derived from the
		                  corpus when omitted (or falsy)
		not_found_list -- optional fallback sentences; derived from the
		                  corpus when omitted (or falsy)
		neighbor_dict  -- optional word -> list of similar words, used when
		                  a seed is not in the corpus
		'''
		self.markov_dict = markov_dict
		self.gtype = self.markov_dict['gtype']
		self.stop_words = set(stopwords.words('english'))
		self.neighbor_dict = neighbor_dict
		self.tokenizer = WhitespaceTokenizer()
		self.word_list = self.tokenizer.tokenize(self.markov_dict['corpus_txt'])
		self.lower_word_list = [w.lower() for w in self.word_list]
		# Count of word freq, maintaining case
		self.word_dict_count = Counter(self.word_list)
		self.truecaser = TrueCase(self.markov_dict['fname'])

		# Create priority and not_found_list if none were entered
		if priority_list:
			self.priority_list = priority_list
		else:
			self._make_priority()
		if not_found_list:
			self.not_found_list = not_found_list
		else:
			self._make_not_found()

	def _make_priority(self, n=10):
		'''Store the n most common content words of the corpus
		in self.priority_list'''
		# Remove stop words
		content = [w for w in self.lower_word_list if w not in self.stop_words]
		# Keep words that contain at least one non-punctuation character
		# (drops tokens that are punctuation only)
		content_no_punc = [w for w in content
						   if any(c not in punctuation for c in w)]

		priority_dict = Counter(content_no_punc)
		self.priority_list = [key for key, val in priority_dict.most_common(n)]

	def _make_not_found(self, n=15):
		'''Store the n most common sentences of the corpus
		in self.not_found_list (fallback replies)'''
		not_found_dict = Counter(sent_tokenize(self.markov_dict['corpus_txt']))
		common_sent = [key for key, val in not_found_dict.most_common(n)]
		# Might fill with small stuff, don't let that happen
		self.not_found_list = [s for s in common_sent if len(s) > 5]

	def _get_input(self, input_phrase):
		'''Pick and return a seed word from the raw user input,
		or None when nothing usable is found'''
		# Lowercase and remove common punctuation in one pass
		# (raw string: '\?' etc. are deprecated escape sequences)
		input_phrase = re.sub(r'[?.,!]', '', input_phrase.lower())

		# List of words from a potential input phrase
		word_list = input_phrase.split()

		# Make a list of words that are in priority_list
		priority_words = [w for w in word_list if w in self.priority_list]

		# If no priority words, look for non stop words
		content = [w for w in word_list if w not in self.stop_words]

		# Look for priority words first, content second, and finally random
		if priority_words:
			seed = np.random.choice(priority_words)
		elif content:
			seed = np.random.choice(content)
		else:  # Final option is a random word
			seed = np.random.choice(word_list)

		# If the word is not in the corpus, fall back to a neighbor
		if not self._in_text(seed):
			seed = self._get_neighbor(seed)

		return seed

	def _in_text(self, word):
		'''Return True if word is in the corpus (case-insensitive)'''
		return word.lower() in set(self.lower_word_list)

	def _get_neighbor(self, seed):
		'''Return a neighbor of seed that appears in the corpus,
		or None when there is no neighbor data / no usable neighbor'''
		if not self.neighbor_dict:
			return None

		# .get: a seed with no entry simply has no neighbors
		# (plain indexing raised KeyError here)
		neighbors = self.neighbor_dict.get(seed, ())

		# Only pick a neighbor that occurs in the corpus
		good_neighbors = [w for w in neighbors if self._in_text(w)]
		if good_neighbors:
			return np.random.choice(good_neighbors)
		return None

	def _generate_key(self, seed, dir_dict):
		'''Return a random key of dir_dict whose tail matches seed

		NOTE(review): `seed in key[-self.key_gram_size]` is kept as-is.
		For the naive gtype that element is a string, so this is a
		substring test; for tagged gtypes it is presumably a (word, tag)
		tuple, making it a membership test -- confirm before changing.
		'''
		key_list = [key for key in dir_dict
					if seed in key[-self.key_gram_size]]
		if not key_list:
			# np.random.choice on an empty range raises an opaque
			# ValueError; keep the type but give a clearer message
			raise ValueError('no chain key matches seed %r' % seed)
		return key_list[np.random.choice(len(key_list))]

	def _run_chain(self, seed, dir_dict):
		'''Return a list of words generated from seed
		Iterate through dictionary until an end marker is reached'''
		key = self._generate_key(seed, dir_dict)
		text = list(key[-self.key_gram_size:])

		while True:
			# values is a list of candidate continuations; a uniform draw
			# reproduces the corpus distribution (repeats appear once per
			# occurrence)
			values = dir_dict[key]
			value = values[np.random.choice(len(values))]
			# End condition: empty tuple marks a sentence boundary
			# ('or', not bitwise '|', for boolean logic)
			if (() in value) or (value == ()):
				break

			# Add a value_gram_size phrase to the text
			text += value[:self.value_gram_size]

			# Create new lookup key from the last gram_size words
			key = tuple(text[-self.markov_dict['gram_size']:])
		return text

	def _get_sentence(self, seed):
		'''Return a sentence (list of grams) given a seed'''
		f_text = self._run_chain(seed, self.markov_dict['f_dict'])
		b_text = self._run_chain(seed, self.markov_dict['b_dict'])

		# b_text is backwards obviously, so turn it around
		b_text = list(reversed(b_text))

		# Only include seed once
		return b_text[:-1] + f_text

	def _get_sentence_str(self, sent):
		'''Return sent joined into a string, with the space that the join
		inserts before punctuation tokens removed'''
		if self.gtype != 'naive':
			# Non-naive grams carry extra info; element 0 is the word
			sent = [w[0] for w in sent]
		text = ' '.join(sent)

		# Single left-to-right pass removing a space before any
		# punctuation char. (The old index loop mutated `text` while
		# iterating over its original length and could skip matches;
		# it also used py2-only xrange.)
		return re.sub(' ([%s])' % re.escape(punctuation), r'\1', text)

	def run(self, input_text, key_gram_size=2, value_gram_size=1):
		'''Return a sentence seeded from input_text.

		Larger key_gram_size gives more deterministic phrases. Both sizes
		are clipped to the dictionary's gram_size, and value_gram_size is
		grown until the two together cover gram_size.
		'''
		gram_size = self.markov_dict['gram_size']
		self.key_gram_size = min(key_gram_size, gram_size)
		self.value_gram_size = min(value_gram_size, gram_size)
		while self.key_gram_size + self.value_gram_size < gram_size:
			self.value_gram_size += 1

		seed = self._get_input(input_text)
		# If seed not in corpus and no neighbor found, return random sent
		if not seed:
			return np.random.choice(self.not_found_list)
		sent = self._get_sentence(seed)

		# Turn into string for output
		sent_str = self._get_sentence_str(sent)

		# Restore the corpus casing
		return self.truecaser.truecase(sent_str)