コード例 #1
0
	def sentence_segment(self, paragraph, tri_gram=False):
		"""Split ``paragraph`` into sentence objects with the help of POS tagging.

		The paragraph is word-segmented and cleaned, a Viterbi decoder picks a
		POS sequence for the cleaned tokens, and the tagged words are then cut
		into sentences and merged.

		Args:
			paragraph: Text to segment; handed to ``self.wp.word_segment``
				together with ``self.dict_name``.
			tri_gram: Forwarded to ``self.corpus.get_statistics_model``. When
				true, ``vtb.viterbi_trigram`` does the decoding; otherwise
				``vtb.viterbi`` is used.

		Returns:
			A list of ``sentence.sentence`` objects, one per merged sentence,
			each built from the merged sentence and the POS-annotated entry at
			the same index.
		"""
		# Preprocess: segment into words, clean them, and set unknown words
		# aside (their bookkeeping is needed again after decoding).
		tokens = self.wp.word_segment(paragraph, dict=self.dict_name)
		cleaned = self.wp.clean_special_characters(tokens)
		to_be_tagged, new_paragraph, replace_idx = self.clean_unknown_word(cleaned)

		# Decode the most likely POS sequence with the matching Viterbi variant.
		initp, trans, emiss = self.corpus.get_statistics_model(tri_gram)
		decode = vtb.viterbi_trigram if tri_gram else vtb.viterbi
		path = decode(to_be_tagged, self.corpus.pos_list_sentence, initp, trans, emiss)

		# Postprocess: map the decoded tags back, cut into sentences, merge.
		pos = self.invert_unknown_word(new_paragraph, path, replace_idx)
		sentences, sen_with_pos = self.cut_sentence(tokens, pos)
		merge_sen, merge_sen_with_pos = self.merge_sentence(sentences, sen_with_pos)

		return [
			sentence.sentence(text, merge_sen_with_pos[idx])
			for idx, text in enumerate(merge_sen)
		]
コード例 #2
0
def get_question_item(question_file, pos_file):
	"""Rebuild question items from a question file and its POS file.

	Every non-blank line of both files is parsed with ``ast.literal_eval``.
	A POS line holds a list of sentences, each a list of ``(word, pos)``
	pairs. A question line holds one entry per sentence, each entry being a
	list of ``(question_sentence, answer, choices)`` tuples. Sentences from
	the two files are paired by their running index, so both files must list
	the sentences in the same order.

	Blank lines (for example a trailing empty line) are skipped rather than
	being handed to ``ast.literal_eval``, which would raise ``SyntaxError``.

	Args:
		question_file: Path of the file with the serialized questions.
		pos_file: Path of the file with the POS-tagged sentences.

	Returns:
		A list of ``question_item.question_item`` objects in file order, each
		with its choice items attached through ``add_choices``.

	Raises:
		IndexError: If the question file refers to more sentences than the
			POS file provides.
	"""
	sentences = []
	sentence_with_pos = []
	with open(pos_file) as f:
		for line in f:
			line = line.strip()
			if not line:
				continue
			for tagged_sentence in ast.literal_eval(line):
				# The plain sentence text is the concatenation of its words.
				sentences.append("".join(word for (word, _) in tagged_sentence))
				sentence_with_pos.append(tagged_sentence)

	sentence_count = 0
	all_question_items = []
	with open(question_file) as f:
		for line in f:
			line = line.strip()
			if not line:
				continue
			for questions_of_sentence in ast.literal_eval(line):
				tagged_sentence = sentence_with_pos[sentence_count]
				sentence_item = sentence.sentence(sentences[sentence_count], tagged_sentence)
				for (question_sentence, answer, choices) in questions_of_sentence:
					answer_item = word_item.word_item(answer)
					answer_index = find_blank_index([tp[0] for tp in tagged_sentence], question_sentence)
					all_generated_choices = _cg.choice_generate(answer_item)

					# Map each stored choice string back to an item: the answer
					# itself, or the first generated choice with the same text.
					# A choice matching neither is left out.
					choice_items = []
					for a_choice in choices:
						if str(a_choice) == answer:
							choice_items.append(answer_item)
							continue
						for gen_choice in all_generated_choices:
							if str(gen_choice) == a_choice:
								choice_items.append(gen_choice)
								break

					question = question_item.question_item(sentence_item, sentence_count, question_sentence, answer_item, answer_index)
					question.add_choices(choice_items)
					all_question_items.append(question)

				# Advance once per sentence entry, even when it has no questions.
				sentence_count += 1

	return all_question_items
コード例 #3
0
	def __init__(self, *args, **kwargs):
		"""Create the item from five positional fields or from a serialized string.

		Two construction forms are recognised:

		* Five positional arguments, in order: ``sentence``, ``sentence_no``,
		  ``question``, ``answer``, ``answer_index``. ``choices`` and
		  ``asked_choices`` start out as ``None``.
		* A ``from_str`` keyword whose value is parsed with
		  ``ast.literal_eval`` and iterated as attribute names. ``"sentence"``,
		  ``"choices"`` and ``"answer"`` are rebuilt into their object forms
		  through the ``from_str`` constructors of ``_sentence.sentence`` and
		  ``_word_item.word_item``; every other entry is stored unchanged.

		With any other call shape none of these attributes are set. In every
		case ``evals`` ends up as a fresh empty list.
		"""
		if len(args) == 5:
			(
				self.sentence,
				self.sentence_no,
				self.question,
				self.answer,
				self.answer_index,
			) = args
			self.choices = None
			self.asked_choices = None
		elif "from_str" in kwargs:
			parsed = ast.literal_eval(kwargs["from_str"])
			for name in parsed:
				raw = parsed[name]
				if name == "sentence":
					value = _sentence.sentence(from_str=raw)
				elif name == "choices":
					value = [_word_item.word_item(from_str=choice_str) for choice_str in raw]
				elif name == "answer":
					value = _word_item.word_item(from_str=raw)
				else:
					value = raw
				setattr(self, name, value)

		# Assigned last on purpose: this overrides any "evals" entry that the
		# serialized attributes may have set above.
		self.evals = []