def transform(self, data, return_mask=None):
    # Encode each sentence into per-token ELMo vectors, then pool them into a
    # single fixed-size vector per sentence.
    tensor = word_to_vec(data, language='en_elmo')
    tensor = np.array(tensor, dtype='float32')
    if self.concat == 'mean':
        tensor = np.mean(tensor, axis=1)
    elif self.concat == 'max':
        tensor = np.max(tensor, axis=1)
    return tensor
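
# Illustration of the pooling step above (not repository code): given token-level
# embeddings with a common sequence length, 'mean' and 'max' pooling collapse the
# token axis into one sentence vector per input. Shapes here are toy values; the
# real ELMo dimension depends on the word_to_vec backend.
import numpy as np

toy_token_embeddings = np.random.rand(2, 5, 1024).astype('float32')  # (batch, tokens, dim)
mean_pooled = np.mean(toy_token_embeddings, axis=1)                  # (2, 1024)
max_pooled = np.max(toy_token_embeddings, axis=1)                    # (2, 1024)
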
def neg_log_likelihood(self, sent_batch, tags):
    word_embeds = to_gpu(
        torch.FloatTensor([word_to_vec(w) for w in sent_batch[0]]))
    word_embeds = self.emb_dropout(word_embeds)
    char_embeds = self.word_encoder(sent_batch[0])
    sentence_in = torch.cat((word_embeds, char_embeds), dim=-1).unsqueeze(1)
    sentence_in = self.dropout(sentence_in)
    feats = self._get_lstm_features(sentence_in)
    forward_score = self._forward_alg(feats)
    gold_score = self._score_sentence(feats, tags[0])
    return feats, forward_score - gold_score
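
# Sketch of the two scores combined above, under the usual linear-chain CRF
# formulation (as in the standard PyTorch BiLSTM-CRF tutorial): the loss is the
# log partition function minus the score of the gold tag path. The convention
# transitions[i, j] = score of moving from tag j to tag i is an assumption;
# this repository's _forward_alg / _score_sentence may handle start/stop tags
# differently.
import torch

def crf_nll_sketch(emissions, transitions, tags):
    # emissions: (seq_len, num_tags); transitions: (num_tags, num_tags); tags: (seq_len,)
    alpha = emissions[0]
    for t in range(1, emissions.size(0)):
        # alpha[j] + transitions[i, j] + emissions[t, i], log-summed over previous tag j
        alpha = torch.logsumexp(
            alpha.unsqueeze(0) + transitions + emissions[t].unsqueeze(1), dim=1)
    log_partition = torch.logsumexp(alpha, dim=0)

    gold = emissions[0, tags[0]]
    for t in range(1, emissions.size(0)):
        gold = gold + transitions[tags[t], tags[t - 1]] + emissions[t, tags[t]]
    return log_partition - gold  # negative log-likelihood of the gold path
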
def forward(self, sent_batch):
    # Don't confuse this with _forward_alg above.
    word_embeds = to_gpu(
        torch.FloatTensor([word_to_vec(w) for w in sent_batch[0]]))
    word_embeds = self.emb_dropout(word_embeds)
    char_embeds = self.word_encoder(sent_batch[0])
    sentence_in = torch.cat((word_embeds, char_embeds), dim=-1).unsqueeze(1)

    # Get the emission scores from the BiLSTM
    lstm_feats = self._get_lstm_features(sentence_in)

    # Find the best path, given the features.
    score, tag_seq = self._viterbi_decode(lstm_feats)
    return score, tag_seq, sent_batch[0]
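
# Sketch of the Viterbi decoding step used above, with the same CRF conventions
# as the sketch after neg_log_likelihood; the repository's _viterbi_decode may
# add explicit start/stop transitions.
import torch

def viterbi_sketch(emissions, transitions):
    # emissions: (seq_len, num_tags); transitions[i, j] = score of tag j -> tag i
    score = emissions[0]
    backpointers = []
    for t in range(1, emissions.size(0)):
        total = score.unsqueeze(0) + transitions    # [i, j] = score[j] + trans(j -> i)
        backpointers.append(total.argmax(dim=1))    # best previous tag for each tag i
        score = total.max(dim=1).values + emissions[t]
    best_tag = int(score.argmax())
    path = [best_tag]
    for bp in reversed(backpointers):
        path.append(int(bp[path[-1]]))
    path.reverse()
    return float(score.max()), path
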
def transform_input(input_list: List[str], model=None) -> np.ndarray:
    if model is not None and model._featurizer is not None:
        X = model._featurizer.transform(input_list)
        X = model.preprocess_input(X)
        X = X.numpy()
    else:
        # Fall back to bag-of-words: average the word vectors of each sentence
        raw_tokens = [word_tokenize(sent) for sent in input_list]
        sent_vectors = word_to_vec(raw_tokens)
        sent_matrix = np.zeros((len(raw_tokens), get_dim()))
        for ix in range(len(input_list)):
            sent_matrix[ix] = np.mean(
                np.array(sent_vectors[ix], dtype='float32'), axis=0)
        X = sent_matrix
    return X
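
# Illustration of the bag-of-words fallback above (not repository code): each
# sentence vector is the mean of its token vectors. The 100-dim toy vectors
# stand in for whatever word_to_vec / get_dim actually provide.
import numpy as np

toy_sentences = [np.random.rand(7, 100), np.random.rand(3, 100)]  # two tokenized sentences
sent_matrix = np.zeros((len(toy_sentences), 100))
for ix, token_vectors in enumerate(toy_sentences):
    sent_matrix[ix] = np.mean(np.array(token_vectors, dtype='float32'), axis=0)
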
def preprocess_input(self, X):
    if self.tokenizer is None:
        self.tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
    tokens = [self.tokenize_fn(sent) for sent in X]
    tokens = self.tokenizer.texts_to_sequences(tokens)
    tfidf_matrix = self.tokenizer.sequences_to_matrix(tokens, mode='tfidf')

    # Gather a TF-IDF weight for every token position in every sentence
    maxlen = max([len(sent) for sent in tokens])
    tfidf_weights = np.zeros((len(tokens), maxlen))
    for i, seq in enumerate(tokens):
        for j, token in enumerate(seq):
            if token < self.tokenizer.num_words:
                tfidf_weights[i][j] = tfidf_matrix[i][token]

    # Convert the token ids back to texts before embedding; this guarantees that
    # the TF-IDF matrix and the embeddings have the same length (OOV words omitted)
    embs = word_to_vec(self.tokenizer.sequences_to_texts(tokens))
    sif_emb = SIF_embedding(embs, tfidf_weights, rmpc=0)
    return torch.from_numpy(sif_emb).float()
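
# Sketch of what SIF_embedding(embs, tfidf_weights, rmpc=0) is expected to do
# under the usual SIF formulation (Arora et al., 2017): a weighted average of
# each sentence's word vectors, where rmpc > 0 would additionally remove the
# projection onto the first principal component(s). This is an assumption about
# the helper, not its actual source.
import numpy as np

def weighted_average_sketch(embs, weights):
    # embs: (n_sentences, max_len, dim); weights: (n_sentences, max_len)
    n_sentences, _, dim = embs.shape
    out = np.zeros((n_sentences, dim))
    for i in range(n_sentences):
        n_tokens = np.count_nonzero(weights[i])
        if n_tokens > 0:
            out[i] = weights[i].dot(embs[i]) / n_tokens
    return out
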
def forward(self, sent_batch: List[List[str]]):
    max_length = min(max([len(sent) for sent in sent_batch]),
                     self.config.max_position_embeddings)
    words_embeddings = to_gpu(
        torch.FloatTensor(word_to_vec(sent_batch, pad_to_length=max_length)))
    # Character-level features, zero-padded or truncated to max_length per sentence
    chars_embeddings = to_gpu(
        torch.stack([
            torch.cat((self.char_encoder(sent),
                       torch.zeros(max_length - len(sent),
                                   self.char_embedding_dim)),
                      dim=0) if len(sent) < max_length else
            self.char_encoder(sent)[:max_length] if len(sent) > max_length else
            self.char_encoder(sent)
            for sent in sent_batch
        ], 0))

    embeddings = torch.cat([words_embeddings, chars_embeddings], dim=-1)
    if self.use_position_embeddings:
        position_ids = torch.arange(max_length,
                                    dtype=torch.long,
                                    device=words_embeddings.device)
        position_ids = position_ids.unsqueeze(0).expand(
            words_embeddings.size(0), words_embeddings.size(1))
        position_embeddings = self.position_embeddings(position_ids)
        # In this branch only the word embeddings are combined with the position
        # embeddings; the character features are not concatenated.
        embeddings = words_embeddings + position_embeddings

    embeddings = self.LayerNorm(embeddings)
    embeddings = self.dropout(embeddings)
    return embeddings
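
# Illustration of the position-embedding step above (not repository code):
# position ids 0..max_length-1 are looked up in a learned embedding table and
# added to the token features. Sizes here are toy values.
import torch
import torch.nn as nn

batch, max_length, dim = 2, 6, 16
token_features = torch.randn(batch, max_length, dim)
position_table = nn.Embedding(max_length, dim)
position_ids = torch.arange(max_length, dtype=torch.long)
position_ids = position_ids.unsqueeze(0).expand(batch, max_length)
embeddings = token_features + position_table(position_ids)  # (2, 6, 16)
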
def transform(self, data):
    raw_tokens = [self.tokenize_fn(sent) for sent in data]
    tokens = self.tokenizer.texts_to_sequences(raw_tokens)
    tfidf_matrix = self.tokenizer.sequences_to_matrix(tokens, mode='tfidf')

    # Look up a TF-IDF weight for every raw token position; OOV tokens get a
    # default weight of 1.
    maxlen = max([len(sent) for sent in raw_tokens])
    tfidf_weights = np.zeros((len(raw_tokens), maxlen))
    for i, seq in enumerate(raw_tokens):
        for j, raw_token in enumerate(seq):
            token = -1
            if raw_token in self.tokenizer.word_index:
                token = self.tokenizer.word_index[raw_token]
            # else:
            #     similar_to_raw_token = most_similar(raw_token)
            #     for similar_word in similar_to_raw_token:
            #         if similar_word in self.tokenizer.word_index:
            #             token = self.tokenizer.word_index[similar_word]
            #             print('Word not found: %s but similar word found: %s'
            #                   % (raw_token, similar_word))
            #             break
            if token > -1:
                tfidf_weights[i][j] = tfidf_matrix[i][token]
            else:
                tfidf_weights[i][j] = 1  # default weight for OOV tokens

    # Embed the raw tokens directly so that the TF-IDF weight matrix and the
    # embeddings have the same length (with OOV words omitted).
    # embs = word_to_vec(self.tokenizer.sequences_to_texts(tokens))
    embs = word_to_vec(raw_tokens)
    if embs is None:
        return None
    sif_emb = SIF_embedding(embs, tfidf_weights, rmpc=0)
    return torch.from_numpy(sif_emb).float()
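
# Illustration of the per-token TF-IDF lookup above (not repository code):
# sequences_to_matrix(mode='tfidf') yields one row per sentence and one column
# per word index, so the weight of the token at position j in sentence i is
# tfidf_matrix[i][token_id]. The tensorflow.keras import path is an assumption
# about which Tokenizer class is in use.
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np

texts = ['the cat sat', 'the dog sat down']
tok = Tokenizer(num_words=50)
tok.fit_on_texts(texts)
seqs = tok.texts_to_sequences(texts)
tfidf = tok.sequences_to_matrix(seqs, mode='tfidf')  # shape (2, 50)

weights = np.zeros((len(seqs), max(len(s) for s in seqs)))
for i, seq in enumerate(seqs):
    for j, token_id in enumerate(seq):
        weights[i][j] = tfidf[i][token_id]
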