def main(config):
    if config.mode == 'train':
        train_data, valid_data = load_dataset(config.mode, config.random_seed)

        # if using a CNN model, pad sentences so that every input in the batch
        # has at least the length required by the widest filter (filter_sizes[-1])
        if config.model == 'cnn':
            train_data = pad_sentence(train_data, config.filter_sizes[-1])
            valid_data = pad_sentence(valid_data, config.filter_sizes[-1])

        train_iter, valid_iter, pad_idx = make_iter(config.batch_size, config.mode,
                                                    train_data=train_data, valid_data=valid_data)

        trainer = Trainer(config, pad_idx, train_iter=train_iter, valid_iter=valid_iter)
        trainer.train()
    else:
        test_data = load_dataset(config.mode, config.random_seed)

        if config.model == 'cnn':
            test_data = pad_sentence(test_data, config.filter_sizes[-1])

        test_iter, pad_idx = make_iter(config.batch_size, config.mode, test_data=test_data)

        trainer = Trainer(config, pad_idx, test_iter=test_iter)
        trainer.inference()
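# A minimal sketch (not the original helper) of the pad_sentence that main()
# assumes: right-pad each sentence so none is shorter than min_len, the widest
# CNN filter width, so every convolution window fits. The '<pad>' token and
# the signature are assumptions inferred from the call sites above.
def pad_sentence(data, min_len, pad_token='<pad>'):
    padded = []
    for sentence in data:
        if len(sentence) < min_len:
            sentence = sentence + [pad_token] * (min_len - len(sentence))
        padded.append(sentence)
    return padded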
def get_fd(self, x_batch, y_batch, lr=None, dropout_embed=1, dropout_fc=1):
    sentences = [list(zip(*x))[1] for x in x_batch]
    char_sentences = [list(zip(*x))[0] for x in x_batch]
    sentences_padded, sentence_lengths = pad_sentence(sentences)
    label_padded, _ = pad_sentence(y_batch)
    chars_padded, chars_lengths = pad_word(char_sentences)

    # Embedding lookup table: for contextual embeddings (embedding_type not in
    # ['glove', 'w2v', 'fasttext']) the lookup table is too large to store in
    # the graph, so we feed it as an input variable instead
    # (or switch to TFRecord in a future version).
    if self.config.embedding_type not in ['glove', 'w2v', 'fasttext']:
        fd = {
            self.word_ids: sentences_padded,
            self.sentence_lengths: sentence_lengths,
            self.char_ids: chars_padded,
            self.word_lengths: chars_lengths,
            self.labels: label_padded,
            self.lr: lr,
            self.dropout_embed: dropout_embed,
            self.dropout_fc: dropout_fc,
            self._word_embeddings_lookup_table: self.config.lookup_table
        }
    else:
        fd = {
            self.word_ids: sentences_padded,
            self.sentence_lengths: sentence_lengths,
            self.char_ids: chars_padded,
            self.word_lengths: chars_lengths,
            self.labels: label_padded,
            self.lr: lr,
            self.dropout_embed: dropout_embed,
            self.dropout_fc: dropout_fc
            # self._word_embeddings_lookup_table: self.config.lookup_table
        }
    return fd, sentence_lengths, label_padded, sentences
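# Rough sketches of the two padding helpers get_fd() relies on, inferred from
# their call sites (each returns the padded batch plus the unpadded lengths).
# The pad id 0 and the nested-list layout of pad_word are assumptions.
def pad_sentence(sentences, pad_id=0):
    lengths = [len(s) for s in sentences]
    max_len = max(lengths)
    padded = [list(s) + [pad_id] * (max_len - len(s)) for s in sentences]
    return padded, lengths

def pad_word(char_sentences, pad_id=0):
    # pad at two levels: every word to the longest word, every sentence to the
    # longest sentence; per-word lengths are kept for the char-level encoder
    word_lengths = [[len(w) for w in sent] for sent in char_sentences]
    max_word = max(max(lens) for lens in word_lengths if lens)
    max_sent = max(len(sent) for sent in char_sentences)
    padded = []
    for sent in char_sentences:
        sent = [list(w) + [pad_id] * (max_word - len(w)) for w in sent]
        sent += [[pad_id] * max_word] * (max_sent - len(sent))
        padded.append(sent)
    lengths = [lens + [0] * (max_sent - len(lens)) for lens in word_lengths]
    return padded, lengths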
def get_attention(self, session, sent1, sent2):
    kp = 1.0
    sent1 = utils.encode_sentence(self.vocab, sent1)
    print(sent1)
    sent2 = utils.encode_sentence(self.vocab, sent2)
    print(sent2)
    sent1 = utils.pad_sentence(self.vocab, sent1, self.config.sent_len, 'post')
    sent2 = utils.pad_sentence(self.vocab, sent2, self.config.sent_len, 'post')
    len1, len2 = np.array([len(sent1)]), np.array([len(sent2)])
    sent1_arr = np.array(sent1).reshape((1, -1))
    sent2_arr = np.array(sent2).reshape((1, -1))
    y = np.array([0, 1, 0]).reshape((1, -1))  # dummy label; the fetched ops don't use it
    feed = self.create_feed_dict(sent1_arr, sent2_arr, len1, len2, y, kp)
    preds, betas = session.run([self.predictions, self.attention], feed_dict=feed)
    return preds, betas
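# A hedged sketch of the utils.pad_sentence variant used in get_attention():
# truncate or pad an encoded sentence to a fixed length, with 'post' placing
# the pad ids at the end. vocab.pad_id is an assumed attribute, not confirmed
# by the original code.
def pad_sentence(vocab, sent, max_len, mode='post'):
    sent = list(sent)[:max_len]
    pads = [vocab.pad_id] * (max_len - len(sent))
    return sent + pads if mode == 'post' else pads + sent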
def to_input_array(self, elements: List):
    """
    Convert a list of sentences (sentence = list of words) into an array,
    padding shorter sentences as necessary.

    :param elements: (List[List]) list of sentences
    :return: array of shape (batch, max_sentence_length)
    """
    elements_ids = self.elements2indices(elements)
    if type(elements[0]) == list:
        elements_ids = pad_sentence(elements_ids, self['<pad>'])
    return np.array(elements_ids, dtype=int)
def to_input_tensor(self, elements: List, device: torch.device):
    """
    Convert a list of sentences (sentence = list of words) into a tensor,
    padding shorter sentences as necessary.

    :param elements: (List[List]) list of sentences
    :param device: device on which to place the result
    :return: tensor of shape (batch, max_sentence_length)
    """
    elements_ids = self.elements2indices(elements)
    if type(elements[0]) == list:
        elements_ids = pad_sentence(elements_ids, self['<pad>'])
    return torch.tensor(elements_ids, dtype=torch.long, device=device)
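# A hypothetical stub of the vocabulary class that to_input_array and
# to_input_tensor sit on, inferred from the calls above (elements2indices,
# __getitem__, and a '<pad>' entry); the original class may differ.
from typing import List

class Vocab:
    def __init__(self, word2id):
        self.word2id = word2id  # e.g. {'<pad>': 0, '<unk>': 1, 'cat': 2, ...}

    def __getitem__(self, word):
        # fall back to the unknown-word id for out-of-vocabulary words
        return self.word2id.get(word, self.word2id['<unk>'])

    def elements2indices(self, elements: List):
        # a batch of sentences maps word-by-word; a flat list maps directly
        if type(elements[0]) == list:
            return [[self[w] for w in sent] for sent in elements]
        return [self[w] for w in elements]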
en_model = gensim.models.KeyedVectors.load_word2vec_format('wiki.en.vec',
                                                           limit=words_limit)
print("Word2Vec English imported")
de_model = gensim.models.KeyedVectors.load_word2vec_format('wiki.de.vec',
                                                           limit=words_limit)
print("Word2Vec German imported")

undefined_word_vec = np.ones((300,), dtype=np.float32)
endtoken_vec = np.zeros((300,), dtype=np.float32)

while True:
    x = input("Enter english:")
    en_line_vecs = [en_model[w] if w in en_model.vocab else undefined_word_vec
                    for w in x.split()]
    en_line_vecs = pad_sentence(en_line_vecs, sentence_length_limit, endtoken_vec)
    predictions = model.predict(np.array([en_line_vecs]))
    reg_pred = predictions[:, :, :300][0]    # regression head: predicted word vectors
    softmax_pred = predictions[:, :, -1][0]  # classification head: real word vs. padding
    outputlist = [de_model.most_similar([reg_pred[i]])[0][0] if softmax_pred[i] >= 0.5
                  else ' ' for i in range(20)]
    outputlist2 = [de_model.most_similar([reg_pred[i]])[0][0] for i in range(20)]
    output = ' '.join(outputlist)
    output2 = ' '.join(outputlist2)
    print('w/ Classif: %s' % output)
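# A plausible sketch of the pad_sentence used in this demo: pad (or truncate)
# a list of word vectors to a fixed length, filling the tail with the
# end-of-sentence vector. Inferred from the call above, not the original helper.
def pad_sentence(vectors, max_len, pad_vec):
    vectors = list(vectors)[:max_len]
    return vectors + [pad_vec] * (max_len - len(vectors))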