コード例 #1
0
def main(config):
    """Run training or inference, depending on ``config.mode``.

    With ``config.mode == 'train'`` the train and validation splits are
    loaded, turned into iterators and given to a ``Trainer`` whose
    ``train()`` method is then called. Any other mode loads a single test
    split and calls ``Trainer.inference()`` instead.

    When ``config.model == 'cnn'`` every loaded split is first passed through
    ``pad_sentence`` with ``config.filter_sizes[-1]`` so that all batch inputs
    reach that minimum length.

    :param config: configuration object; this function reads ``mode``,
        ``random_seed``, ``model``, ``filter_sizes`` and ``batch_size``
    """
    if config.mode != 'train':
        # Inference path: one test split, no validation data.
        test_split = load_dataset(config.mode, config.random_seed)
        if config.model == 'cnn':
            test_split = pad_sentence(test_split, config.filter_sizes[-1])

        test_iter, pad_idx = make_iter(config.batch_size,
                                       config.mode,
                                       test_data=test_split)
        Trainer(config, pad_idx, test_iter=test_iter).inference()
        return

    # Training path: train and validation splits are loaded together.
    train_split, valid_split = load_dataset(config.mode, config.random_seed)

    if config.model == 'cnn':
        # Pad up to the last entry of filter_sizes (minimum input length).
        min_len = config.filter_sizes[-1]
        train_split = pad_sentence(train_split, min_len)
        valid_split = pad_sentence(valid_split, min_len)

    train_iter, valid_iter, pad_idx = make_iter(config.batch_size,
                                                config.mode,
                                                train_data=train_split,
                                                valid_data=valid_split)

    Trainer(config,
            pad_idx,
            train_iter=train_iter,
            valid_iter=valid_iter).train()
コード例 #2
0
ファイル: model.py プロジェクト: dlsanf2000/CRF-LSTM-NER
    def get_fd(self, x_batch, y_batch, lr=None, dropout_embed=1, dropout_fc=1):
        """Build the feed dict for one batch.

        Each sample in ``x_batch`` is unzipped; component 1 is used as the
        word sequence and component 0 as the character sequence
        (presumably each sample is a sequence of ``(char_ids, word_id)``
        pairs -- confirm against the caller).

        :param x_batch: batch of input samples
        :param y_batch: batch of label sequences
        :param lr: value fed to ``self.lr``
        :param dropout_embed: value fed to ``self.dropout_embed``
        :param dropout_fc: value fed to ``self.dropout_fc``
        :return: ``(feed_dict, sentence_lengths, padded_labels, sentences)``
            where ``sentences`` are the unpadded word sequences
        """
        word_seqs = [list(zip(*sample))[1] for sample in x_batch]
        char_seqs = [list(zip(*sample))[0] for sample in x_batch]

        padded_words, seq_lens = pad_sentence(word_seqs)
        padded_labels, _ = pad_sentence(y_batch)
        padded_chars, word_lens = pad_word(char_seqs)

        # Entries shared by every embedding type.
        fd = {
            self.word_ids: padded_words,
            self.sentence_lengths: seq_lens,
            self.char_ids: padded_chars,
            self.word_lengths: word_lens,
            self.labels: padded_labels,
            self.lr: lr,
            self.dropout_embed: dropout_embed,
            self.dropout_fc: dropout_fc,
        }

        # For embeddings other than glove / w2v / fasttext (i.e. contextual
        # embeddings) the lookup table is too large to embed in the graph, so
        # it is fed in as an input instead.
        if self.config.embedding_type not in ('glove', 'w2v', 'fasttext'):
            fd[self._word_embeddings_lookup_table] = self.config.lookup_table

        return fd, seq_lens, padded_labels, word_seqs
 def get_attention(self, session, sent1, sent2):
     """Run the model on one sentence pair and return predictions and attention.

     Both sentences are encoded with ``utils.encode_sentence`` (the encoded
     form is printed), padded with ``utils.pad_sentence`` to
     ``self.config.sent_len`` in ``'post'`` mode, and fed as a batch of one.

     :param session: object whose ``run`` evaluates the fetches
     :param sent1: first sentence
     :param sent2: second sentence
     :return: ``(predictions, attention)`` as returned by ``session.run``
     """
     # Encode both sentences first, echoing each encoding as it is produced.
     encoded = []
     for raw in (sent1, sent2):
         ids = utils.encode_sentence(self.vocab, raw)
         print(ids)
         encoded.append(ids)

     padded = []
     for ids in encoded:
         padded.append(
             utils.pad_sentence(self.vocab, ids, self.config.sent_len, 'post'))
     first, second = padded

     # NOTE(review): lengths are measured after padding -- confirm intended.
     first_len = np.array([len(first)])
     second_len = np.array([len(second)])
     first_row = np.array(first).reshape((1, -1))
     second_row = np.array(second).reshape((1, -1))

     # Fixed label row; presumably a placeholder required by the feed dict.
     label_row = np.array([0, 1, 0]).reshape((1, -1))
     keep_prob = 1.0  # presumably dropout keep-probability -- TODO confirm

     feed = self.create_feed_dict(first_row, second_row, first_len,
                                  second_len, label_row, keep_prob)
     preds, attn = session.run([self.predictions, self.attention],
                               feed_dict=feed)
     return preds, attn
コード例 #4
0
 def get_attention(self, session, sent1, sent2):
     """Evaluate predictions and attention weights for a single sentence pair.

     Each sentence is encoded via ``utils.encode_sentence`` and printed, then
     padded via ``utils.pad_sentence`` to ``self.config.sent_len`` (``'post'``
     mode). The pair is fed as a one-row batch.

     :param session: object whose ``run`` evaluates the fetches
     :param sent1: first sentence
     :param sent2: second sentence
     :return: ``(predictions, attention)`` as returned by ``session.run``
     """
     first_ids = utils.encode_sentence(self.vocab, sent1)
     print(first_ids)
     second_ids = utils.encode_sentence(self.vocab, sent2)
     print(second_ids)

     first_ids = utils.pad_sentence(
         self.vocab, first_ids, self.config.sent_len, 'post')
     second_ids = utils.pad_sentence(
         self.vocab, second_ids, self.config.sent_len, 'post')

     # NOTE(review): lengths are taken from the padded sequences -- confirm.
     first_len = np.array([len(first_ids)])
     second_len = np.array([len(second_ids)])

     feed = self.create_feed_dict(
         np.array(first_ids).reshape((1, -1)),
         np.array(second_ids).reshape((1, -1)),
         first_len,
         second_len,
         # Fixed label row; presumably a placeholder needed by the feed dict.
         np.array([[0, 1, 0]]),
         1.0,  # presumably dropout keep-probability -- TODO confirm
     )
     fetches = [self.predictions, self.attention]
     preds, attention = session.run(fetches, feed_dict=feed)
     return preds, attention
コード例 #5
0
 def to_input_array(self, elements: List):
     """
     Convert elements to their indices and return them as an integer array.

     When the first element is itself a list (``elements`` is a list of
     sentences, each a list of words), the index lists are passed through
     ``pad_sentence`` together with the ``'<pad>'`` index before conversion,
     so shorter sentences are padded. An empty ``elements`` produces an
     empty array instead of raising ``IndexError``.

     :param elements: (List or List[List]) elements, or list of sentences
     :return: integer array; (batch, max_sentence_length) for sentence input
     """
     elements_ids = self.elements2indices(elements)
     # len() guards the empty case; isinstance also accepts list subclasses.
     if len(elements) > 0 and isinstance(elements[0], list):
         elements_ids = pad_sentence(elements_ids, self['<pad>'])
     return np.array(elements_ids, dtype=int)
コード例 #6
0
 def to_input_tensor(self, elements: List, device: torch.device):
     """
     Convert elements to their indices and return them as a long tensor.

     When the first element is itself a list (``elements`` is a list of
     sentences, each a list of words), the index lists are passed through
     ``pad_sentence`` together with the ``'<pad>'`` index before conversion,
     so shorter sentences are padded. An empty ``elements`` produces an
     empty tensor instead of raising ``IndexError``.

     :param elements: (List or List[List]) elements, or list of sentences
     :param device: on which device to return the result
     :return: long tensor; (batch, max_sentence_length) for sentence input
     """
     elements_ids = self.elements2indices(elements)
     # len() guards the empty case; isinstance also accepts list subclasses.
     if len(elements) > 0 and isinstance(elements[0], list):
         elements_ids = pad_sentence(elements_ids, self['<pad>'])
     return torch.tensor(elements_ids, dtype=torch.long, device=device)
コード例 #7
0
                                                           limit=words_limit)
print("Word2Vec English imported")
# Load the German vectors from 'wiki.de.vec' (word2vec text format), reading
# at most `words_limit` entries.
de_model = gensim.models.KeyedVectors.load_word2vec_format('wiki.de.vec',
                                                           limit=words_limit)
print("Word2Vec German imported")

# Stand-in vector (all ones, 300 dims) for words missing from the English
# vocabulary.
undefined_word_vec = np.ones((300, ), dtype=np.float32)
# All-zero 300-dim vector handed to pad_sentence below; presumably the fill
# value for positions past the end of the sentence -- confirm in pad_sentence.
endtoken_vec = np.zeros((300, ), dtype=np.float32)

# Interactive loop: read an English line, run the model, print decoded words.
while (True):
    x = input("Enter english:")
    # NOTE(review): despite the `de_` prefix these are the *English* input
    # vectors, one per whitespace-separated token.
    de_line_vecs = [
        en_model[w] if w in en_model.vocab else undefined_word_vec
        for w in x.split()
    ]
    de_line_vecs = pad_sentence(de_line_vecs, sentence_length_limit,
                                endtoken_vec)

    # Predict on a batch of one sentence.
    predictions = model.predict(np.array([de_line_vecs]))
    # First 300 channels of the last axis for the single batch item; these
    # are looked up as word vectors in de_model below.
    reg_pred = predictions[:, :, :300][0]
    # Last channel of the last axis; thresholded at 0.5 below.
    softmax_pred = predictions[:, :, -1][0]
    # Nearest German word per position where the last channel is >= 0.5,
    # otherwise a blank. Only the first 20 positions are decoded
    # (NOTE(review): hard-coded 20 -- presumably sentence_length_limit).
    outputlist = [
        de_model.most_similar([reg_pred[i]])[0][0]
        if softmax_pred[i] >= 0.5 else ' ' for i in range(20)
    ]
    # Same decoding without the threshold filter.
    outputlist2 = [
        de_model.most_similar([reg_pred[i]])[0][0] for i in range(20)
    ]

    output = ' '.join(outputlist)
    output2 = ' '.join(outputlist2)
    print('w/ Classif: %s' % output)