Example #1
    def transform(self, data, return_mask=None):
        # Embed each sentence with contextual word vectors.
        tensor = word_to_vec(data, language='en_elmo')
        for sent in tensor:
            print(np.array(sent, dtype='float32').shape)  # debug: per-sentence shape
        tensor = np.array(tensor, dtype='float32')
        print(tensor.shape)  # debug: batch shape
        # Pool over the token axis to get a single vector per sentence.
        if self.concat == 'mean':
            tensor = np.mean(tensor, axis=1)
        elif self.concat == 'max':
            tensor = np.max(tensor, axis=1)
        return tensor
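A minimal NumPy-only sketch of the pooling step above, assuming word_to_vec returned a padded batch of shape (num_sentences, num_tokens, embedding_dim); word_to_vec itself is not reimplemented here:

    import numpy as np

    batch = np.random.rand(2, 5, 1024).astype('float32')  # e.g. 2 sentences, 5 tokens, 1024-dim vectors
    mean_pooled = np.mean(batch, axis=1)  # -> (2, 1024), one vector per sentence
    max_pooled = np.max(batch, axis=1)    # -> (2, 1024)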
Example #2
    def neg_log_likelihood(self, sent_batch, tags):
        # Word-level embeddings for the (single) sentence in the batch.
        word_embeds = to_gpu(
            torch.FloatTensor([word_to_vec(w) for w in sent_batch[0]]))
        word_embeds = self.emb_dropout(word_embeds)

        # Character-level features from the word encoder.
        char_embeds = self.word_encoder(sent_batch[0])

        # Concatenate both feature sets and add a batch dimension.
        sentence_in = torch.cat((word_embeds, char_embeds),
                                dim=-1).unsqueeze(1)
        sentence_in = self.dropout(sentence_in)

        # CRF-style loss: partition function (forward score) minus gold path score.
        feats = self._get_lstm_features(sentence_in)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags[0])
        return feats, forward_score - gold_score
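A hedged training-loop sketch; model, optimizer and the (sent_batch, tags) pairs are placeholders, and the only assumption is that the second value returned above is the scalar loss to minimize:

    for sent_batch, tags in training_data:  # hypothetical iterable of examples
        model.zero_grad()
        _, loss = model.neg_log_likelihood(sent_batch, tags)
        loss.backward()
        optimizer.step()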
Example #3
    def forward(self,
                sent_batch):  # don't confuse this with _forward_alg above.
        word_embeds = to_gpu(
            torch.FloatTensor([word_to_vec(w) for w in sent_batch[0]]))
        word_embeds = self.emb_dropout(word_embeds)

        char_embeds = self.word_encoder(sent_batch[0])

        sentence_in = torch.cat((word_embeds, char_embeds),
                                dim=-1).unsqueeze(1)

        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence_in)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq, sent_batch[0]
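A matching inference sketch (model and sent_batch are placeholders); decoding needs no gradients, so it can run under torch.no_grad():

    with torch.no_grad():
        score, tag_seq, words = model(sent_batch)  # Viterbi-decoded tag indices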
Example #4
def transform_input(input_list: List[str], model=None) -> np.ndarray:
    X = None
    if model is not None and model._featurizer is not None:
        # Use the model's own featurizer and preprocessing pipeline when available.
        X = model._featurizer.transform(input_list)
        X = model.preprocess_input(X)
        X = X.numpy()
    else:  # fall back to averaged word vectors
        raw_tokens = [word_tokenize(sent) for sent in input_list]
        sent_vectors = word_to_vec(raw_tokens)

        # One row per sentence: the mean of its token vectors.
        sent_matrix = np.zeros((len(raw_tokens), get_dim()))
        for ix in range(len(input_list)):
            print(sent_vectors[ix])  # debug: raw token vectors for this sentence
            sent_matrix[ix] = np.mean(np.array(sent_vectors[ix],
                                               dtype='float32'),
                                      axis=0)

        X = sent_matrix
    return X
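A hedged call sketch for the fallback branch, assuming word_to_vec returns one list of token vectors per tokenized sentence and get_dim() reports the embedding width:

    # No trained model available, so the averaged-word-vector path is taken.
    vectors = transform_input(["hello world", "a second sentence"], model=None)
    print(vectors.shape)  # -> (2, get_dim())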
Example #5
    def preprocess_input(self, X):
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)

        tokens = [self.tokenize_fn(sent) for sent in X]
        tokens = self.tokenizer.texts_to_sequences(tokens)
        tfidf_matrix = self.tokenizer.sequences_to_matrix(tokens, mode='tfidf')

        # Per-token TF-IDF weights, aligned with the index-encoded sequences.
        maxlen = max([len(sent) for sent in tokens])
        tfidf_weights = np.zeros((len(tokens), maxlen))
        for i, seq in enumerate(tokens):
            for j, token in enumerate(seq):
                if token < self.tokenizer.num_words:
                    tfidf_weights[i][j] = tfidf_matrix[i][token]

        # Convert the token ids back to texts before embedding; this guarantees
        # that the TF-IDF weights and the embedded tokens have the same length
        # (out-of-vocabulary words are omitted by the tokenizer).
        embs = word_to_vec(self.tokenizer.sequences_to_texts(tokens))

        # SIF weighting; rmpc=0 skips removing the first principal component.
        sif_emb = SIF_embedding(embs, tfidf_weights, rmpc=0)

        return torch.from_numpy(sif_emb).float()
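For reference, a minimal sketch of the weighted-average step that a SIF_embedding-style function typically performs (the actual SIF_embedding used here is not shown; rmpc=0 above means no principal-component removal):

    import numpy as np

    def weighted_sentence_average(embs, weights):
        # embs:    (num_sentences, num_tokens, dim) token vectors
        # weights: (num_sentences, num_tokens) per-token weights, e.g. TF-IDF
        weighted = embs * weights[:, :, None]
        return weighted.sum(axis=1) / np.maximum(
            weights.sum(axis=1, keepdims=True), 1e-8)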
Example #6
    def forward(self, sent_batch: List[List[str]]):
        max_length = min(max([len(sent) for sent in sent_batch]),
                         self.config.max_position_embeddings)

        words_embeddings = to_gpu(
            torch.FloatTensor(word_to_vec(sent_batch,
                                          pad_to_length=max_length)))

        # Character-level encodings, zero-padded or truncated to max_length.
        chars_embeddings = to_gpu(
            torch.stack([
                torch.cat((self.char_encoder(sent),
                           torch.zeros(max_length -
                                       len(sent), self.char_embedding_dim)),
                          dim=0) if len(sent) < max_length else
                self.char_encoder(sent)[:max_length]
                if len(sent) > max_length else self.char_encoder(sent)
                for sent in sent_batch
            ], 0))

        if self.use_position_embeddings:
            position_ids = torch.arange(max_length,
                                        dtype=torch.long,
                                        device=words_embeddings.device)
            position_ids = position_ids.unsqueeze(0).expand(
                words_embeddings.size(0), words_embeddings.size(1))

            position_embeddings = self.position_embeddings(position_ids)

        # Combine word- and character-level features.
        embeddings = torch.cat([words_embeddings, chars_embeddings], dim=-1)

        # position_embeddings only exists when use_position_embeddings is set,
        # so it must not be referenced unconditionally.
        if self.use_position_embeddings:
            embeddings = words_embeddings + position_embeddings

        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
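The nested conditional above pads or truncates each sentence's character encoding to max_length; a standalone sketch of the same idea (hypothetical helper, shapes assumed as in the code):

    import torch

    def pad_or_truncate(enc, max_length, dim):
        # enc: (sentence_length, dim) character-level encodings for one sentence.
        if enc.size(0) < max_length:
            padding = torch.zeros(max_length - enc.size(0), dim)
            return torch.cat((enc, padding), dim=0)
        return enc[:max_length]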
Example #7
    def transform(self, data):
        raw_tokens = [self.tokenize_fn(sent) for sent in data]
        tokens = self.tokenizer.texts_to_sequences(raw_tokens)
        tfidf_matrix = self.tokenizer.sequences_to_matrix(tokens, mode='tfidf')

        maxlen = max([len(sent) for sent in tokens])
        tfidf_weights = np.zeros((len(tokens), maxlen))
        for i, seq in enumerate(raw_tokens):
            for j, raw_token in enumerate(seq):
                token = -1
                if raw_token in self.tokenizer.word_index:
                    token = self.tokenizer.word_index[raw_token]
                # else:
                #     similar_to_raw_token = most_similar(raw_token)
                #     for similar_word in similar_to_raw_token:
                #         print(similar_to_raw_token)
                #         if similar_word in self.tokenizer.word_index:
                #             token = self.tokenizer.word_index[similar_word]
                #             print('Word not found: %s but similar word found: %s' % (raw_token, similar_word))
                #             break
                if token > -1:
                    tfidf_weights[i][j] = tfidf_matrix[i][token]
                else:
                    tfidf_weights[i][j] = 1  # default weight to 1

        # convert the token ids back to texts; this guarantees that the TF-IDF
        # matrix and X have the same length (out-of-vocabulary words are omitted)
        # embs = word_to_vec(self.tokenizer.sequences_to_texts(tokens))
        # print(raw_tokens)
        embs = word_to_vec(raw_tokens)

        if embs is None:
            return None

        sif_emb = SIF_embedding(embs, tfidf_weights, rmpc=0)

        return torch.from_numpy(sif_emb).float()
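A hedged usage sketch; featurizer is a placeholder instance of the surrounding class, and it assumes the Keras-style tokenizer was already fit, since sequences_to_matrix(..., mode='tfidf') needs document counts:

    # featurizer.tokenizer.fit_on_texts(training_sentences) was called beforehand.
    sif_vectors = featurizer.transform(["an example sentence", "another one"])
    print(sif_vectors.shape)  # -> (num_sentences, embedding_dim)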