def sentence_segment(self, paragraph, tri_gram=False):
    """Turn a paragraph into a list of POS-annotated sentence objects.

    The paragraph goes through three stages: pre-processing (word
    segmentation, special-character cleaning, unknown-word replacement),
    Viterbi decoding to obtain the most probable part-of-speech sequence,
    and post-processing (restoring unknown words, cutting into sentences
    and merging them).

    Args:
        paragraph: Text handed to ``self.wp.word_segment``.
        tri_gram: Passed to ``self.corpus.get_statistics_model``. When
            truthy the sequence is decoded with ``vtb.viterbi_trigram``,
            otherwise with ``vtb.viterbi``.

    Returns:
        A list holding one ``sentence.sentence`` per merged sentence, each
        built from the merged sentence and its POS-annotated counterpart.
    """
    # Pre-processing.
    words = self.wp.word_segment(paragraph, dict=self.dict_name)
    cleaned_words = self.wp.clean_special_characters(words)
    to_be_tagged, new_paragraph, replace_idx = self.clean_unknown_word(
        cleaned_words
    )

    # Decoding: both decoders take the same arguments, so only the callable
    # depends on the tri_gram flag.
    initp, trans, emiss = self.corpus.get_statistics_model(tri_gram)
    decode = vtb.viterbi_trigram if tri_gram else vtb.viterbi
    path = decode(
        to_be_tagged, self.corpus.pos_list_sentence, initp, trans, emiss
    )

    # Post-processing. Sentences are cut from the segmented words as first
    # produced (before cleaning), paired with the restored POS sequence.
    pos = self.invert_unknown_word(new_paragraph, path, replace_idx)
    sentences, sen_with_pos = self.cut_sentence(words, pos)
    merge_sen, merge_sen_with_pos = self.merge_sentence(sentences, sen_with_pos)

    return [
        sentence.sentence(merged_text, merge_sen_with_pos[position])
        for position, merged_text in enumerate(merge_sen)
    ]
def get_question_item(question_file, pos_file):
    """Rebuild question items from a question file and its POS file.

    Every line of ``pos_file`` is parsed with ``ast.literal_eval`` and is
    expected to yield a sequence of sentences, each a sequence of
    two-element ``(word, tag)`` pairs. Every line of ``question_file`` is
    parsed the same way and is expected to yield, per sentence, a sequence
    of ``(question_sentence, answer, choices)`` triples. Sentences in the
    two files are matched purely by their running position.

    NOTE(review): the nesting was reconstructed from a whitespace-mangled
    source; the sentence counter is assumed to advance once per sentence
    entry of the question file, not once per question -- confirm.

    Args:
        question_file: Path of the file holding the question triples.
        pos_file: Path of the file holding the POS-annotated sentences.

    Returns:
        A list of ``question_item.question_item`` objects, in file order,
        each with its choices attached through ``add_choices``.
    """
    plain_sentences = []
    tagged_sentences = []
    with open(pos_file) as pos_handle:
        for raw_line in pos_handle:
            for tagged in ast.literal_eval(raw_line.strip()):
                plain_sentences.append("".join([word for (word, _) in tagged]))
                tagged_sentences.append(tagged)

    # Unique marker meaning "no generated choice matched"; unlike None it
    # can never collide with a real generated choice.
    no_match = object()

    all_question_items = []
    sentence_count = 0
    with open(question_file) as question_handle:
        for raw_line in question_handle:
            for sentence_questions in ast.literal_eval(raw_line.strip()):
                sentence_text = plain_sentences[sentence_count]
                tagged = tagged_sentences[sentence_count]
                sentence_item = sentence.sentence(sentence_text, tagged)

                for question_sentence, answer, choices in sentence_questions:
                    answer_item = word_item.word_item(answer)
                    answer_index = find_blank_index(
                        [tp[0] for tp in tagged], question_sentence
                    )
                    all_generated_choices = _cg.choice_generate(answer_item)

                    choice_items = []
                    for a_choice in choices:
                        if str(a_choice) == answer:
                            choice_items.append(answer_item)
                            continue
                        # Take the first generated choice whose string form
                        # equals the stored choice; a choice with no match
                        # is left out.
                        matched = next(
                            (
                                generated
                                for generated in all_generated_choices
                                if str(generated) == a_choice
                            ),
                            no_match,
                        )
                        if matched is not no_match:
                            choice_items.append(matched)

                    question = question_item.question_item(
                        sentence_item,
                        sentence_count,
                        question_sentence,
                        answer_item,
                        answer_index,
                    )
                    question.add_choices(choice_items)
                    all_question_items.append(question)

                sentence_count += 1

    return all_question_items
def __init__(self, *args, **kwargs):
    """Create a question item from five positional values or from a string.

    Two construction forms are recognised:

    * Exactly five positional arguments, taken in order as ``sentence``,
      ``sentence_no``, ``question``, ``answer`` and ``answer_index``. They
      are stored unchanged, and ``choices`` and ``asked_choices`` start
      out as ``None``.
    * A ``from_str`` keyword argument holding a Python literal that
      ``ast.literal_eval`` turns into a mapping of attribute names to
      values. ``sentence`` and ``answer`` are rebuilt through the
      ``from_str`` constructors of ``_sentence.sentence`` and
      ``_word_item.word_item``, ``choices`` becomes a list of
      ``_word_item.word_item`` objects, and every other key is stored
      as-is.

    If neither form applies, no attribute other than ``evals`` is set.

    NOTE(review): the nesting was reconstructed from a whitespace-mangled
    source; ``evals`` is assumed to be reset to an empty list on every
    construction path, including after ``from_str`` -- confirm.
    """
    if len(args) == 5:
        (
            self.sentence,
            self.sentence_no,
            self.question,
            self.answer,
            self.answer_index,
        ) = args
        self.choices = None
        self.asked_choices = None
    elif "from_str" in kwargs:
        attributes = ast.literal_eval(kwargs["from_str"])
        for key in attributes:
            value = attributes[key]
            # Nested objects are stored in string form and must be rebuilt;
            # anything else is assigned directly.
            if key == "sentence":
                value = _sentence.sentence(from_str=value)
            elif key == "choices":
                value = [
                    _word_item.word_item(from_str=choice_str)
                    for choice_str in value
                ]
            elif key == "answer":
                value = _word_item.word_item(from_str=value)
            setattr(self, key, value)

    self.evals = []