Example 1
def save_data(self, preprocess, qid, q1, q2, label):
    if preprocess:
        q1 = preprocess_sentence(q1, preprocess)
        q2 = preprocess_sentence(q2, preprocess)
    # non-duplicate pair -> "not similar", one-hot label [0, 1]
    if label == '0':
        self._non_sim_data.append(Data(qid, q1, q2, label, [0, 1]))
    # duplicate pair -> "similar", one-hot label [1, 0]
    else:
        self._sim_data.append(Data(qid, q1, q2, label, [1, 0]))
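
A hypothetical call, for illustration only (loader, the qid, the preprocessing mode and the question strings are all placeholders; the last Data argument is the one-hot label vector described above):

loader.save_data(preprocess='basic', qid='447',
                 q1='How do I read a CSV file?',
                 q2='What is the best way to read a CSV file?',
                 label='1')  # '1' -> duplicate, so the pair lands in _sim_data
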
Example 2
import numpy as np
import tensorflow as tf

# `utils` (the project's preprocessing module) and `units` (the encoder's
# hidden size) are assumed to be defined in the enclosing module.
def evaluate(sentence, encoder, decoder, inp_lang, targ_lang, max_length_inp,
             max_length_targ):
    attention_plot = np.zeros((max_length_targ, max_length_inp))
    sentence = utils.preprocess_sentence(sentence)
    inputs = [inp_lang.word2idx[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences(
        [inputs], maxlen=max_length_inp, padding='post')
    inputs = tf.convert_to_tensor(inputs)
    result = ''
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word2idx['<start>']], 0)
    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(
            dec_input, dec_hidden, enc_out)
        # store the attention weights to plot them later
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()
        predicted_id = tf.argmax(predictions[0]).numpy()
        result += targ_lang.idx2word[predicted_id] + ' '
        if targ_lang.idx2word[predicted_id] == '<end>':
            return result, sentence, attention_plot
        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)
    return result, sentence, attention_plot
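
A hypothetical call, assuming a trained encoder/decoder pair and fitted inp_lang/targ_lang index objects (all names below are placeholders):

result, sentence, attention_plot = evaluate(
    'hace mucho frio aqui.', encoder, decoder, inp_lang, targ_lang,
    max_length_inp, max_length_targ)
print('Input:  %s' % sentence)
print('Output: %s' % result)
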
Example 3
from tqdm import tqdm

def parse_file(self, file, language="english"):
    with open(file, "r") as input_file:
        for line in tqdm(input_file.readlines()):
            # strip the trailing newline before tokenizing
            tokens = preprocess_sentence(line[:-1], language=language)
            self.word_counter.update(tokens)

    # index words from most frequent to least frequent
    for pair in self.word_counter.most_common():
        self.index_word(pair[0])
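
A usage sketch, assuming a hypothetical Vocab class that owns parse_file together with a collections.Counter named word_counter and an index_word method:

vocab = Vocab()  # hypothetical class; assumed to initialize word_counter itself
vocab.parse_file('corpus.txt', language='english')
# afterwards every corpus word has an index, assigned from most to least frequent
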
Example 4
from typing import List

def sentence_to_indexes(sentence: str, vocab: dict) -> List[int]:
    tokens = preprocess_sentence(sentence)
    if len(tokens) == 0:
        return [vocab['UNKNOWN_TOKEN']]
    indexes = []
    for token in tokens:
        if token in vocab:
            indexes.append(vocab[token])
        else:
            indexes.append(vocab['UNKNOWN_TOKEN'])

    return indexes
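
A quick usage sketch with a toy vocabulary; it assumes preprocess_sentence returns a list of tokens (its exact tokenization is project-specific):

vocab = {'UNKNOWN_TOKEN': 0, 'how': 1, 'are': 2, 'you': 3}
# in-vocabulary tokens map to their index, everything else to UNKNOWN_TOKEN
print(sentence_to_indexes('how are you today', vocab))  # e.g. [1, 2, 3, 0]
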
Example 5
def create_gold_markable_list(doc_obj, input_file, key_file):
    with open(input_file) as ifp:
        for line_num, line in enumerate(ifp):
            line = line.strip('\n')
            sent_tag_unrem = line                           # unprocessed line
            sent_tag_rem = utils.preprocess_sentence(line)  # preprocessed line
            #extract_markables_from_input_file (doc_obj, line_num, sent_tag_unrem, sent_tag_rem)
            spacy_extract_markables_from_input_file(doc_obj, line_num,
                                                    sent_tag_unrem, sent_tag_rem)

    if key_file is None:
        return

    with open(key_file) as kfp:
        #handle_key_file (doc_obj, kfp)
        spacy_handle_key_file(doc_obj, kfp)
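
A hypothetical invocation; doc_obj and the file names are placeholders:

doc_obj = Document()  # hypothetical container the markables are attached to
create_gold_markable_list(doc_obj, 'input.txt', 'input.key')
# pass key_file=None to run only the markable-extraction pass
create_gold_markable_list(doc_obj, 'input.txt', None)
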
Example 6
    def evaluate(sentence):
        attention_plot = np.zeros((max_length_targ, max_length_input))

        sentence = preprocess_sentence(sentence)

        inputs = [input_tokenizer.word_index[i] for i in sentence.split(' ')]
        inputs = tf.keras.preprocessing.sequence.pad_sequences(
            [inputs], maxlen=max_length_input, padding='post')
        inputs = tf.convert_to_tensor(inputs)

        result = ''

        hidden = [tf.zeros((1, 256))]  # initial state; 256 = encoder hidden size
        enc_out, enc_hidden = encoder(inputs, hidden)

        dec_hidden = enc_hidden
        dec_input = tf.expand_dims([targ_tokenizer.word_index['<start>']], 0)

        for t in range(max_length_targ):
            predictions, dec_hidden, attention_weights = decoder(
                dec_input, dec_hidden, enc_out)

            # store the attention weights to plot them later
            attention_weights = tf.reshape(attention_weights, (-1, ))
            attention_plot[t] = attention_weights.numpy()

            predicted_id = tf.argmax(predictions[0]).numpy()

            result += targ_tokenizer.index_word[predicted_id] + ' '

            if targ_tokenizer.index_word[predicted_id] == '<end>':
                return result, sentence, attention_plot

            # the predicted ID is fed back into the model
            dec_input = tf.expand_dims([predicted_id], 0)

        return result, sentence, attention_plot
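
This evaluate is nested in an enclosing scope and closes over encoder, decoder, both tokenizers and the length limits, so it can only be called from that scope; a sketch of such a call (the sentence is a placeholder):

result, sentence, attention_plot = evaluate('how are you ?')
print('Predicted translation:', result.strip())
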