def prepro(self, context, question):
    context = context.replace("''", '" ').replace("``", '" ')
    context_tokens = word_tokenize(context)
    context_chars = [list(token) for token in context_tokens]
    spans = convert_idx(context, context_tokens)

    ques = question.replace("''", '" ').replace("``", '" ')
    ques_tokens = word_tokenize(ques)
    ques_chars = [list(token) for token in ques_tokens]

    # Index tensors with a leading batch dimension of 1; char_limit caps the
    # number of characters kept per token.
    context_idxs = np.zeros([1, len(context_tokens)], dtype=np.int32)
    context_char_idxs = np.zeros([1, len(context_tokens), char_limit], dtype=np.int32)
    ques_idxs = np.zeros([1, len(ques_tokens)], dtype=np.int32)
    ques_char_idxs = np.zeros([1, len(ques_tokens), char_limit], dtype=np.int32)

    def _get_word(word):
        # Try casing variants before falling back to index 1 (unknown word).
        for each in (word, word.lower(), word.capitalize(), word.upper()):
            if each in self.word2idx_dict:
                return self.word2idx_dict[each]
        return 1

    def _get_char(char):
        if char in self.char2idx_dict:
            return self.char2idx_dict[char]
        return 1

    for i, token in enumerate(context_tokens):
        context_idxs[0, i] = _get_word(token)
    for i, token in enumerate(ques_tokens):
        ques_idxs[0, i] = _get_word(token)
    for i, token in enumerate(context_chars):
        for j, char in enumerate(token):
            if j == char_limit:
                break
            context_char_idxs[0, i, j] = _get_char(char)
    for i, token in enumerate(ques_chars):
        for j, char in enumerate(token):
            if j == char_limit:
                break
            ques_char_idxs[0, i, j] = _get_char(char)

    print('ques_idxs:', ques_idxs)
    print('ques_char_idxs:', ques_char_idxs)
    return spans, context_idxs, ques_idxs, context_char_idxs, ques_char_idxs
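# --- Hedged usage sketch (not from the original source). It assumes a running
# --- TF session `sess`, placeholders named as in demo_backend below, and that
# --- yp2 is the inclusive end-token index; `context_text` / `question_text`
# --- are illustrative names.
# spans, c, q, ch, qh = self.prepro(context_text, question_text)
# fd = {'context:0': c, 'question:0': q, 'context_char:0': ch, 'question_char:0': qh}
# yp1, yp2 = sess.run([model.yp1, model.yp2], feed_dict=fd)
# answer = context_text[spans[yp1[0]][0]:spans[yp2[0]][1]]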
def demo_backend(self, model, config, run_event):
    global query, response
    with open(config.word_dictionary, "r") as fh:
        word_dictionary = json.load(fh)
    with open(config.char_dictionary, "r") as fh:
        char_dictionary = json.load(fh)
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True

    with model.graph.as_default():
        with tf.Session(config=sess_config) as sess:
            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver()
            saver.restore(sess, tf.train.latest_checkpoint(config.save_dir))
            if config.decay < 1.0:
                sess.run(model.assign_vars)
            while run_event.is_set():
                sleep(0.1)
                if query:
                    context = word_tokenize(query[0].replace("''", '" ').replace("``", '" '))
                    c, ch, q, qh, co, qo = convert_to_features(
                        config, query, word_dictionary, char_dictionary,
                        Batcher(config.elmo_vocab_file, config.cont_char_limit))
                    fd = {'context:0': [c],
                          'question:0': [q],
                          'context_char:0': [ch],
                          'question_char:0': [qh],
                          'context_cont:0': [co],
                          'question_cont:0': [qo]}
                    yp1, yp2 = sess.run([model.yp1, model.yp2], feed_dict=fd)
                    yp2[0] += 1
                    response = " ".join(context[yp1[0]:yp2[0]])
                    query = []
def demo_backend(self, model, config, run_event):
    global query, response
    with open(config.word_dictionary, "r") as fh:
        word_dictionary = json.load(fh)
    with open(config.char_dictionary, "r") as fh:
        char_dictionary = json.load(fh)
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True

    with model.graph.as_default():
        with tf.Session(config=sess_config) as sess:
            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver()
            saver.restore(sess, tf.train.latest_checkpoint(config.save_dir))
            if config.decay < 1.0:
                sess.run(model.assign_vars)
            while run_event.is_set():
                sleep(0.1)
                if query:
                    context = word_tokenize(query[0].replace("''", '" ').replace("``", '" '))
                    c, ch, q, qh = convert_to_features(config, query, word_dictionary, char_dictionary)
                    fd = {'context:0': [c],
                          'question:0': [q],
                          'context_char:0': [ch],
                          'question_char:0': [qh]}
                    yp1, yp2 = sess.run([model.yp1, model.yp2], feed_dict=fd)
                    yp2[0] += 1
                    response = " ".join(context[yp1[0]:yp2[0]])
                    query = []
def prepro(self, contexts):
    num = len(contexts)
    context_tokens = []
    for text in contexts:
        text = to_unicode(text)
        # Split each context into clause-like pieces on ASCII and full-width punctuation.
        lst = re.split(r",|\?|!|。|,|?|!", text)
        tokens = []
        for x in lst:
            para_tokens = word_tokenize(x)
            tokens.append(para_tokens)
        context_tokens.append(tokens)

    context_idxs = np.zeros([self.batch_size, self.para_max_num, self.para_max_length],
                            dtype=np.int32)

    def _get_word(each):
        if each in self.word2idx_dict:
            return self.word2idx_dict[each]
        return 1

    for b, context_token in enumerate(context_tokens):
        for i, tokens in enumerate(context_token):
            if i < self.para_max_num:
                for j, token in enumerate(tokens):
                    if j < self.para_max_length:
                        context_idxs[b, i, j] = _get_word(token)
    return context_idxs
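# --- Illustrative note (not from the original source): the re.split pattern
# --- above breaks a context on ASCII and full-width punctuation, e.g.
# re.split(r",|\?|!|。|,|?|!", "Hello, world! How are you?")
# # -> ['Hello', ' world', ' How are you', '']
# each piece is then tokenized and written into the
# [batch_size, para_max_num, para_max_length] index tensor, with out-of-range
# pieces/tokens truncated and unknown words mapped to index 1.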
def compute_bleu(candidate, reference):
    # Despite the name, this is a unigram-precision score: the fraction of
    # non-stopword candidate tokens that also appear in the reference.
    candidate_token = word_tokenize(candidate)[:-1]  # drop the last token (typically final punctuation)
    filtered_sentence = [w for w in candidate_token if w not in stop_words]
    reference_token = word_tokenize(reference)
    count = 0
    for token in filtered_sentence:
        if token in reference_token:
            count += 1
    if len(filtered_sentence) == 0:
        return 0
    return count / len(filtered_sentence)
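# --- Hedged usage sketch (not from the original source). It assumes
# --- `stop_words` is the NLTK English stopword set, e.g.
# ---     from nltk.corpus import stopwords
# ---     stop_words = set(stopwords.words("english"))
# compute_bleu("the cat sat on the mat .", "a cat sat on a mat")  # -> 1.0
# # every non-stopword candidate token ("cat", "sat", "mat") occurs in the
# # reference; the score is unigram precision, not a true n-gram BLEU.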
def demo_backend(self, model, config, run_event):
    global query, response
    with open(config.word_dictionary, "r") as fh:
        word_dictionary = json.load(fh)
    with open(config.char_dictionary, "r") as fh:
        char_dictionary = json.load(fh)
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True

    with model.graph.as_default():
        with tf.Session(config=sess_config) as sess:
            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver()
            saver.restore(sess, tf.train.latest_checkpoint(config.save_dir))
            if config.decay < 1.0:
                sess.run(model.assign_vars)
            while run_event.is_set():
                sleep(0.1)
                if query:
                    context = word_tokenize(query[0].replace("''", '" ').replace("``", '" '))
                    c, ch, q, qh = convert_to_features(config, query, word_dictionary, char_dictionary)
                    fd = {
                        'context:0': [c],
                        'question:0': [q],
                        'context_char:0': [ch],
                        'question_char:0': [qh]
                    }
                    yp1, yp2, logits1, logits2 = sess.run(
                        [model.yp1, model.yp2, model.logits1, model.logits2],
                        feed_dict=fd)
                    # Turn the start/end logits at the predicted positions into
                    # rough confidence scores via the logistic sigmoid.
                    log_prob1 = logits1[0][yp1[0]]
                    log_prob2 = logits2[0][yp2[0]]
                    score1 = 1 / (1 + math.exp(-log_prob1))
                    score2 = 1 / (1 + math.exp(-log_prob2))
                    print("SV Confidence: ", score1, "EV Confidence: ", score2)
                    yp2[0] += 1
                    response = " ".join(context[yp1[0]:yp2[0]])
                    query = []
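# --- Illustrative note (not from the original source): the confidence scores
# --- above are logistic sigmoids of the start/end logits at the predicted
# --- positions, e.g. a logit of 2.0 gives 1 / (1 + math.exp(-2.0)) ≈ 0.88.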
def prepro(self, context, question):
    context = context.replace("''", '" ').replace("``", '" ')
    context = pre_proc(context)
    # word_tokenize here returns (tokens, POS tags, NER entities, lemmas).
    context_tokens, context_tags, context_ents, context_lemmas = word_tokenize(context)
    context_lower_tokens = [w.lower() for w in context_tokens]
    context_chars = [list(token) for token in context_tokens]
    spans = convert_idx(context, context_tokens)

    # Normalized term frequency of each context token.
    counter_ = Counter(context_lower_tokens)
    tf_total = len(context_lower_tokens)
    context_tf = [float(counter_[w]) / float(tf_total) for w in context_lower_tokens]

    ques = question.replace("''", '" ').replace("``", '" ')
    ques = pre_proc(ques)
    ques_tokens, ques_tags, ques_ents, ques_lemmas = word_tokenize(ques)
    ques_lower_tokens = [w.lower() for w in ques_tokens]
    ques_chars = [list(token) for token in ques_tokens]
    ques_lemma = {
        lemma if lemma != '-PRON-' else lower
        for lemma, lower in zip(ques_lemmas, ques_lower_tokens)
    }
    ques_tokens_set = set(ques_tokens)
    ques_lower_tokens_set = set(ques_lower_tokens)

    # Exact, lowercased, and lemma match features for every context token.
    match_origin = [w in ques_tokens_set for w in context_tokens]
    match_lower = [w in ques_lower_tokens_set for w in context_lower_tokens]
    match_lemma = [
        (c_lemma if c_lemma != '-PRON-' else c_lower) in ques_lemma
        for (c_lemma, c_lower) in zip(context_lemmas, context_lower_tokens)
    ]

    example = {
        "context_tokens": context_tokens,
        "context_chars": context_chars,
        "match_origin": match_origin,
        "match_lower": match_lower,
        "match_lemma": match_lemma,
        "context_pos": context_tags,
        "context_ner": context_ents,
        "context_tf": context_tf,
        "ques_tokens": ques_tokens,
        "ques_pos": ques_tags,
        "ques_ner": ques_ents,
        "ques_chars": ques_chars
    }

    # Fixed-size padded arrays for the passage and the question.
    context_idxs = np.zeros([self.para_limit], dtype=np.int32)
    context_elmo_tokens = example['context_tokens']
    match_origin = np.zeros([self.para_limit], dtype=np.int32)
    match_lower = np.zeros([self.para_limit], dtype=np.int32)
    match_lemma = np.zeros([self.para_limit], dtype=np.int32)
    context_tf = np.zeros([self.para_limit], dtype=np.float32)
    context_pos_idxs = np.zeros([self.para_limit], dtype=np.int32)
    context_ner_idxs = np.zeros([self.para_limit], dtype=np.int32)
    context_char_idxs = np.zeros([self.para_limit, self.char_limit], dtype=np.int32)
    ques_idxs = np.zeros([self.ques_limit], dtype=np.int32)
    ques_elmo_tokens = example['ques_tokens']
    ques_pos_idxs = np.zeros([self.ques_limit], dtype=np.int32)
    ques_ner_idxs = np.zeros([self.ques_limit], dtype=np.int32)
    ques_char_idxs = np.zeros([self.ques_limit, self.char_limit], dtype=np.int32)

    def _get_word(word):
        for each in (word, word.lower(), word.capitalize(), word.upper()):
            if each in self.word2idx_dict:
                return self.word2idx_dict[each]
        return 1

    def _get_pos(pos):
        if pos in self.pos2idx_dict:
            return self.pos2idx_dict[pos]
        return 1

    def _get_ner(ner):
        if ner in self.ner2idx_dict:
            return self.ner2idx_dict[ner]
        return 1

    def _get_char(char):
        if char in self.char2idx_dict:
            return self.char2idx_dict[char]
        return 1

    for i, token in enumerate(example["context_tokens"]):
        context_idxs[i] = _get_word(token)
    for i, match in enumerate(example["match_origin"]):
        match_origin[i] = 1 if match == True else 0
    for i, match in enumerate(example["match_lower"]):
        match_lower[i] = 1 if match == True else 0
    for i, match in enumerate(example["match_lemma"]):
        match_lemma[i] = 1 if match == True else 0
    for i, tf in enumerate(example['context_tf']):
        context_tf[i] = tf
    for i, pos in enumerate(example['context_pos']):
        context_pos_idxs[i] = _get_pos(pos)
    for i, ner in enumerate(example['context_ner']):
        context_ner_idxs[i] = _get_ner(ner)
    for i, token in enumerate(example["ques_tokens"]):
        ques_idxs[i] = _get_word(token)
    for i, pos in enumerate(example['ques_pos']):
        ques_pos_idxs[i] = _get_pos(pos)
    for i, ner in enumerate(example['ques_ner']):
        ques_ner_idxs[i] = _get_ner(ner)
    for i, token in enumerate(example["context_chars"]):
        for j, char in enumerate(token):
            if j == self.char_limit:
                break
            context_char_idxs[i, j] = _get_char(char)
    for i, token in enumerate(example["ques_chars"]):
        for j, char in enumerate(token):
            if j == self.char_limit:
                break
            ques_char_idxs[i, j] = _get_char(char)

    # Wrap everything in batch-size-1 tensors on the target device.
    passage_ids = torch.LongTensor([context_idxs.tolist()]).to(self.device)
    passage_char_ids = torch.LongTensor([context_char_idxs.tolist()]).to(self.device)
    passage_pos_ids = torch.LongTensor([context_pos_idxs.tolist()]).to(self.device)
    passage_ner_ids = torch.LongTensor([context_ner_idxs.tolist()]).to(self.device)
    passage_match_origin = torch.FloatTensor([match_origin.tolist()]).to(self.device)
    passage_match_lower = torch.FloatTensor([match_lower.tolist()]).to(self.device)
    passage_match_lemma = torch.FloatTensor([match_lemma.tolist()]).to(self.device)
    passage_tf = torch.FloatTensor([context_tf.tolist()]).to(self.device)
    ques_ids = torch.LongTensor([ques_idxs.tolist()]).to(self.device)
    ques_char_ids = torch.LongTensor([ques_char_idxs.tolist()]).to(self.device)
    ques_pos_ids = torch.LongTensor([ques_pos_idxs.tolist()]).to(self.device)
    ques_ner_ids = torch.LongTensor([ques_ner_idxs.tolist()]).to(self.device)
    passage_elmo_ids = batch_to_ids([context_elmo_tokens]).to(self.device)
    question_elmo_ids = batch_to_ids([ques_elmo_tokens]).to(self.device)

    # Trim the padded tensors to the actual passage/question lengths.
    p_lengths = passage_ids.ne(0).long().sum(1)
    q_lengths = ques_ids.ne(0).long().sum(1)
    passage_maxlen = int(torch.max(p_lengths, 0)[0])
    ques_maxlen = int(torch.max(q_lengths, 0)[0])
    passage_ids = passage_ids[:, :passage_maxlen]
    passage_char_ids = passage_char_ids[:, :passage_maxlen]
    passage_pos_ids = passage_pos_ids[:, :passage_maxlen]
    passage_ner_ids = passage_ner_ids[:, :passage_maxlen]
    passage_match_origin = passage_match_origin[:, :passage_maxlen]
    passage_match_lower = passage_match_lower[:, :passage_maxlen]
    passage_match_lemma = passage_match_lemma[:, :passage_maxlen]
    passage_tf = passage_tf[:, :passage_maxlen]
    ques_ids = ques_ids[:, :ques_maxlen]
    ques_char_ids = ques_char_ids[:, :ques_maxlen]
    ques_pos_ids = ques_pos_ids[:, :ques_maxlen]
    ques_ner_ids = ques_ner_ids[:, :ques_maxlen]
    p_mask = self.compute_mask(passage_ids)
    q_mask = self.compute_mask(ques_ids)

    return (passage_ids, passage_char_ids, passage_pos_ids, passage_ner_ids,
            passage_match_origin.unsqueeze(2).float(),
            passage_match_lower.unsqueeze(2).float(),
            passage_match_lemma.unsqueeze(2).float(),
            passage_tf.unsqueeze(2), p_mask,
            ques_ids, ques_char_ids, ques_pos_ids, ques_ner_ids, q_mask,
            passage_elmo_ids, question_elmo_ids), spans