def save_data(self, preprocess, qid, q1, q2, label):
    if preprocess:
        q1 = preprocess_sentence(q1, preprocess)
        q2 = preprocess_sentence(q2, preprocess)
    # Label '0' marks a non-duplicate pair -> not similar
    if label == '0':
        self._non_sim_data.append(Data(qid, q1, q2, label, [0, 1]))
    # Otherwise the pair is a duplicate -> similar
    else:
        self._sim_data.append(Data(qid, q1, q2, label, [1, 0]))

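# Hedged usage sketch (not from the source): Data is assumed here to be a simple
# record type, e.g. collections.namedtuple('Data', ['qid', 'q1', 'q2', 'label', 'one_hot']),
# where the last field is a one-hot label ([0, 1] = not similar, [1, 0] = similar),
# and `preprocess` is whatever mode flag preprocess_sentence expects. A hypothetical
# call on a loader instance could look like:
#   loader.save_data('basic', '42', 'How do I learn Python?',
#                    'What is the best way to learn Python?', '1')
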
def evaluate(sentence, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ):
    attention_plot = np.zeros((max_length_targ, max_length_inp))

    sentence = utils.preprocess_sentence(sentence)
    inputs = [inp_lang.word2idx[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences(
        [inputs], maxlen=max_length_inp, padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word2idx['<start>']], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(
            dec_input, dec_hidden, enc_out)

        # storing the attention weights to plot later on
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        predicted_id = tf.argmax(predictions[0]).numpy()
        result += targ_lang.idx2word[predicted_id] + ' '

        if targ_lang.idx2word[predicted_id] == '<end>':
            return result, sentence, attention_plot

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence, attention_plot

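# A small, self-contained sketch (not part of the source) of how the attention_plot
# matrix returned above could be visualised with matplotlib; the function and
# argument names below are illustrative assumptions only:
import matplotlib.pyplot as plt

def plot_attention_sketch(attention, sentence_tokens, result_tokens):
    # Rows correspond to generated target tokens, columns to input tokens.
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.matshow(attention[:len(result_tokens), :len(sentence_tokens)], cmap='viridis')
    ax.set_xticks(range(len(sentence_tokens)))
    ax.set_xticklabels(sentence_tokens, rotation=90)
    ax.set_yticks(range(len(result_tokens)))
    ax.set_yticklabels(result_tokens)
    plt.show()
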
def parse_file(self, file, language="english"):
    # Count tokens over the whole file, one preprocessed line at a time.
    with open(file, "r") as input_file:
        for line in tqdm(input_file.readlines()):
            tokens = preprocess_sentence(line[:-1], language=language)
            self.word_counter.update(tokens)
    # Assign indices to words, most frequent first.
    for pair in self.word_counter.most_common():
        self.index_word(pair[0])

def sentence_to_indexes(sentence: str, vocab: dict) -> List[int]:
    tokens = preprocess_sentence(sentence)
    if len(tokens) == 0:
        return [vocab['UNKNOWN_TOKEN']]
    indexes = []
    for token in tokens:
        if token in vocab:
            indexes.append(vocab[token])
        else:
            indexes.append(vocab['UNKNOWN_TOKEN'])
    return indexes

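# Hypothetical usage (the vocab below and the assumed tokenization are
# illustrative, not from the source; 'UNKNOWN_TOKEN' must be present in the vocab):
#   vocab = {'UNKNOWN_TOKEN': 0, 'how': 1, 'are': 2, 'you': 3}
#   sentence_to_indexes('How are you, stranger?', vocab)
#   # -> [1, 2, 3, 0], assuming preprocess_sentence lower-cases and drops
#   #    punctuation; the out-of-vocabulary word "stranger" maps to UNKNOWN_TOKEN
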
def create_gold_markable_list(doc_obj, input_file, key_file):
    ifp = open(input_file)
    line_num = 0
    for line in ifp:
        line = line.strip('\n')
        sent_tag_unrem = line
        sent_tag_rem = utils.preprocess_sentence(line)
        # extract_markables_from_input_file(doc_obj, line_num, sent_tag_unrem, sent_tag_rem)
        spacy_extract_markables_from_input_file(doc_obj, line_num, sent_tag_unrem, sent_tag_rem)
        line_num += 1
    ifp.close()
    if key_file is None:
        return
    kfp = open(key_file)
    # handle_key_file(doc_obj, kfp)
    spacy_handle_key_file(doc_obj, kfp)
    kfp.close()

def evaluate(sentence):
    attention_plot = np.zeros((max_length_targ, max_length_input))

    sentence = preprocess_sentence(sentence)
    inputs = [input_tokenizer.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences(
        [inputs], maxlen=max_length_input, padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''
    hidden = [tf.zeros((1, 256))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_tokenizer.word_index['<start>']], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(
            dec_input, dec_hidden, enc_out)

        # store the attention weights to plot them later
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        predicted_id = tf.argmax(predictions[0]).numpy()
        result += targ_tokenizer.index_word[predicted_id] + ' '

        if targ_tokenizer.index_word[predicted_id] == '<end>':
            return result, sentence, attention_plot

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence, attention_plot

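# None of the snippets above include preprocess_sentence itself, and its signature
# differs between them (some take a mode or language argument). As a rough, hedged
# sketch of what such a helper typically does in these seq2seq examples (the real
# implementations in each project may differ):
import re
import unicodedata

def preprocess_sentence_sketch(sentence):
    # Normalize unicode, lower-case and trim whitespace.
    sentence = unicodedata.normalize('NFKC', sentence).lower().strip()
    # Separate punctuation from words: "hello!" -> "hello !".
    sentence = re.sub(r"([?.!,¿])", r" \1 ", sentence)
    sentence = re.sub(r"\s+", " ", sentence).strip()
    # Add the markers that the decoding loops above look up in the target vocabulary.
    return '<start> ' + sentence + ' <end>'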