Example 1
class ElmoNer(nn.Module):
    def __init__(self,
                 num_units,
                 rnn_hidden,
                 num_tags,
                 num_layers=1,
                 use_cuda=False):
        super(ElmoNer, self).__init__()
        self.use_cuda = use_cuda
        self.embedding = Embedder(ELMO_PRETAIN_PATH)
        self.rnn = nn.GRU(num_units,
                          rnn_hidden,
                          num_layers=num_layers,
                          batch_first=True,
                          bidirectional=True)
        self.linear = nn.Linear(2 * rnn_hidden, num_tags)
        # self.linear = nn.Linear(num_units, num_tags)
        self.crf = CRF(num_tags)

    def forward(self, x_data, y_data, masks):
        """
		前向算法
		:param x_data:
		:param y_data:
		:param masks:
		:return:
		"""
        encoded_layers = self.embedding.sents2elmo(x_data)
        out = self.rnn_layer(encoded_layers)
        loss = -1 * self.crf(out, y_data.transpose(0, 1), masks.transpose(
            0, 1))
        return loss

    def rnn_layer(self, encoded_layers):
        """
		batch seq_len hidden
		:param encoded_layers:
		:return: batch seq_len class
		"""
        encoded_layers = np.array(encoded_layers)
        encoded_layers = torch.from_numpy(encoded_layers)
        if self.use_cuda:
            encoded_layers = encoded_layers.cuda()
        out, _ = self.rnn(encoded_layers)
        out = self.linear(out)
        out = out.transpose(0, 1)
        return out

    def test(self, x_data, masks):
        encoded_layers = self.embedding.sents2elmo(x_data)

        out = self.rnn_layer(encoded_layers)
        best_paths = self.crf.decode(out, mask=masks.transpose(0, 1))
        return best_paths
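A minimal training-step sketch for the class above, assuming the CRF class comes from the pytorch-crf package, that ELMO_PRETAIN_PATH points to a downloaded ELMoForManyLangs model, and that the tags and masks below are placeholder data (the batch uses equal-length sentences, since rnn_layer stacks the per-sentence arrays directly):

import torch

model = ElmoNer(num_units=1024, rnn_hidden=256, num_tags=5)   # 1024 = ELMo output dim
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Placeholder batch: two equal-length tokenized sentences, dummy gold tags, full masks.
x_batch = [['今', '天', '天', '气', '真', '好'],
           ['你', '吃', '饭', '了', '吗', '?']]
y_batch = torch.zeros(2, 6, dtype=torch.long)                 # (batch, seq_len)
masks = torch.ones(2, 6, dtype=torch.uint8)                   # (batch, seq_len)

optimizer.zero_grad()
loss = model(x_batch, y_batch, masks)    # negative CRF log-likelihood
loss.backward()
optimizer.step()

print(model.test(x_batch, masks))        # best tag-id path per sentence from CRF decode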
Example 2
class ELMOTagger(BaseModel):
    def __init__(self, config, dataset):
        super().__init__(config)
        self.dataset = dataset
        self.output_size = len(dataset.vocabs.pos)
        self.embedder = Embedder(self.config.elmo_model,
                                 batch_size=self.config.batch_size)
        self.elmo_layer = self.config.elmo_layer
        if hasattr(self.config, 'lstm_size'):
            self.lstm = nn.LSTM(1024,
                                self.config.lstm_size,
                                batch_first=True,
                                dropout=self.config.dropout,
                                num_layers=self.config.lstm_num_layers,
                                bidirectional=True)
            hidden_size = self.config.lstm_size * 2
        else:
            self.lstm = None
            hidden_size = 1024
        if self.elmo_layer == 'weighted_sum':
            self.elmo_weights = nn.Parameter(torch.ones(3, dtype=torch.float))
        self.output_proj = nn.Linear(hidden_size, self.output_size)
        # ignore <pad> = 3
        self.criterion = nn.CrossEntropyLoss(
            ignore_index=self.dataset.vocabs.pos['<pad>'])

    def compute_loss(self, batch, output):
        target = to_cuda(torch.LongTensor(batch.pos))
        return compute_sequence_loss(target, output, self.criterion)

    def forward(self, batch):
        batch_size = len(batch[0])
        if self.elmo_layer == 'mean':
            embedded = self.embedder.sents2elmo(batch.sentence, -1)
            embedded = np.stack(embedded)
            embedded = to_cuda(torch.from_numpy(embedded))
        elif self.elmo_layer == 'weighted_sum':
            embedded = self.embedder.sents2elmo(batch.sentence, -2)
            embedded = np.stack(embedded)
            embedded = to_cuda(torch.from_numpy(embedded))
            embedded = (self.elmo_weights[None, :, None, None] *
                        embedded).sum(1)
        else:
            embedded = self.embedder.sents2elmo(batch.sentence,
                                                self.elmo_layer)
            embedded = np.stack(embedded)
            embedded = to_cuda(torch.from_numpy(embedded))
        if self.lstm:
            embedded = self.lstm(embedded)[0]
        return self.output_proj(embedded)
Example 3
def test():
    import numpy as np
    e = Embedder(ELMO_PRETAIN_PATH)
    text = [['今', '天', '天', '气', '真', '好', '阿'],
            ['你', '吃', '饭', '了', '吗', '?', 'd']]
    a = e.sents2elmo(text)
    a = np.array(a)
    print(a.shape)
    print(a)

    new_text = [chs_to_cht(line) for line in text]

    b = e.sents2elmo(new_text)
    print(b[0].shape, b[1].shape)
    print(b)
Example 4
def BuildDataSet(subset='train', output = -1):
  dataset_list = [list(read_corpus(fetch_20newsgroups(subset=subset,
                                          remove=('headers', 'footers', 'quotes'),
                                    categories=[cat])['data'],
                                    tokens_only=True,  bFastText=False, bRemoveStopWords=True))\
                        for cat in my_cats]
  corpus = [doc for categroy_list in dataset_list for doc in categroy_list ]

  categories_lengths=[len(cat_liste) for cat_liste in dataset_list]
  categories = [[k for _ in range(0,length)] for k,length in enumerate(categories_lengths)]
  cats = [cat for elem_list in categories for cat in elem_list]  
  y = np.array(cats)
  np.savetxt('ELmo_20news_group_rep\\y_{}.csv'.format(subset), y, delimiter=",")
  
  print ('raw corpus ELMO Rep {} size {}'.format(subset, len(corpus)))
  e = Embedder('..\\PreTrainedElmo_EN', batch_size=64)
  
  try:
    os.remove('ELmo_20news_group_rep\\X_3L_{}1.csv'.format(subset))
  except:
    pass
  t0 = time()
  np_em= np.vstack([np.mean(np.array(e.sents2elmo([corpus[i]], output_layer=-2)[0]), axis=1).reshape(-1) 
      for i in range(0,len(corpus)//3)])
  np.savetxt('ELmo_20news_group_rep\\X_3L_{}1.csv'.format(subset), np_em, delimiter=',')
  print(f'finished generating 1st chunk of elmo reps in {time() - t0} seconds')


  try:
    os.remove('ELmo_20news_group_rep\\X_3L_{}2.csv'.format(subset))
  except:
    pass
  t0 = time()
  np_em= np.vstack([np.mean(np.array(e.sents2elmo([corpus[i]], output_layer=-2)[0]), axis=1).reshape(-1) 
      for i in range(len(corpus)//3, 2*(len(corpus)//3))])
  np.savetxt('ELmo_20news_group_rep\\X_3L_{}2.csv'.format(subset), np_em, delimiter=',')
  print(f'finished generating 2nd chunk of elmo reps in {time() - t0} seconds')


  try:
    os.remove('ELmo_20news_group_rep\\X_3L_{}3.csv'.format(subset))
  except:
    pass
  t0 = time()
  np_em= np.vstack([np.mean(np.array(e.sents2elmo([corpus[i]], output_layer=-2)[0]), axis=1).reshape(-1) 
      for i in range(2*(len(corpus)//3),len(corpus))])
  np.savetxt('ELmo_20news_group_rep\\X_3L_{}3.csv'.format(subset), np_em, delimiter=',')
  print(f'finished generating 3rd chunk of elmo reps in {time() - t0} seconds')
Example 5
    def trainELMoToFile(self, elmoModelPath, filename, words, vocab_size):
        """Build ELMo word embeddings
		
		elmoModelPath: the absolute path from the model to process*
		filename: filename to save word embeddings
		words: word list (vocabulary)
		vocab_size: length of word list (vocabulary)

		* visit to https://github.com/HIT-SCIR/ELMoForManyLangs#downloads to download Spanish model and check set ups.
		"""
        from elmoformanylangs import Embedder
        import numpy as np
        e = Embedder(elmoModelPath)  # path to the downloaded model (e.g. /home/user/project/ELMo.es/)
        embedding_matrix = np.zeros((vocab_size, 1024))

        for i, word in enumerate(words):
            aux_elmo = e.sents2elmo([[word]])
            with open(filename, 'a') as g:
                strnums = [str(num) for num in aux_elmo[0][0].tolist()]
                strnums = ' '.join(strnums)
                g.write("{} {}\n".format(word, strnums))
                # print ("Processing \t{} of {}...".format( i+1, len(words)) )
            # the with-block closes the file; no explicit g.close() is needed
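A hedged usage sketch for the method above; the paths and word list are placeholders, and trainer stands for an instance of the class this method belongs to:

words = ['hola', 'mundo', 'casa']                       # hypothetical vocabulary
trainer.trainELMoToFile(
    elmoModelPath='/home/user/project/ELMo.es/',        # downloaded Spanish model directory
    filename='elmo_embeddings.txt',                     # one "word v1 v2 ... v1024" line per word
    words=words,
    vocab_size=len(words))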
Example 6
class Elmo:

    def __init__(self, lang="fr"):
        self.name = "elmo"
        if lang == "fr":
            from elmoformanylangs import Embedder
            self.e = Embedder('/home/bmazoyer/Dev/ELMoForManyLangs/150', batch_size=32)
            self.vectors = None
        elif lang == "en":
            import tensorflow as tf
            import tensorflow_hub as hub
            self.embed = hub.Module("https://tfhub.dev/google/elmo/2")
            self.session = tf.Session()
            self.session.run(tf.global_variables_initializer())
            self.session.run(tf.tables_initializer())
        self.lang = lang

    def populate_array(self, data):
        logging.info(data.name)
        self.vectors[data.name] = np.mean(np.array(self.e.sents2elmo([data.text.split()]))[0], axis=0)

    def compute_vectors(self, data):
        n = data.shape[0]
        self.vectors = np.zeros((n, 1024))
        if self.lang == "fr":
            data.apply(self.populate_array, axis=1)
            return self.vectors
        elif self.lang == "en":
            batch_size = 64
            for i in tqdm(range(0, n, batch_size)):
                self.vectors[i:min(n, i + batch_size)] = self.session.run(
                    self.embed(data.text[i:min(n, i + batch_size)].tolist(), signature="default", as_dict=True)[
                        "default"]
                )
            return self.vectors
Example 7
class PretrainEmbedder(ElmoEmbedding):
    def __init__(self, model_dir, batch_size=64):
        super(PretrainEmbedder, self).__init__()
        self.embedder = Embedder(model_dir, batch_size)

    def predict(self, sentences, layer_index=-1):
        return self.embedder.sents2elmo(sentences, layer_index)
Example 8
def generate_data_set(test_cats=my_cats, subset='train'):
    dataset_list = [list(read_corpus(fetch_20newsgroups(subset=subset,
                                            remove=('headers', 'footers', 'quotes'),
                                        categories=[cat])['data'],
                                        tokens_only=True,  bFastText=False, bRemoveStopWords=True))\
                            for cat in test_cats]
    corpus = [doc for categroy_list in dataset_list for doc in categroy_list]

    categories_lengths = [len(cat_liste) for cat_liste in dataset_list]
    categories = [[k for _ in range(0, length)]
                  for k, length in enumerate(categories_lengths)]

    cats = [cat for elem_list in categories for cat in elem_list]
    y = np.array(cats)
    np.savetxt('ELmo_20news_group_rep\\y_train_reduced.csv',
               y[range(0, len(cats), 50)],
               delimiter=",")

    print('raw corpus ELMO Rep {} size {}'.format(subset, len(corpus)))
    e = Embedder('..\\PreTrainedElmo_EN', batch_size=64)

    try:
        os.remove('ELmo_20news_group_rep\\X_train_reduced.csv')
    except:
        pass

    with open('ELmo_20news_group_rep\\X_train_reduced.csv',
              mode='a') as myFile:
        # for i in range(0,2):
        for i in range(0, len(corpus), 50):
            em = np.mean(e.sents2elmo([corpus[i]])[0], axis=0)
            # print (em.shape)
            myFile.write('{}'.format(
                em.tolist()).strip('[').strip(']').replace(' ', ''))
            myFile.write('\n')
Example 9
class WordEmbeddings():
    """
        ELMo
        https://allennlp.org/elmo

    """
    def __init__(self,
                 model_path=r'../auxiliary_data/zhs.model/',
                 cuda_device=0):
        self.cuda_device = cuda_device
        self.elmo = Embedder(model_path)

    def get_tokenized_words_embeddings(self, sents_tokened):
        """
        @see EmbeddingDistributor
        :param tokenized_sents: list of tokenized words string (sentences/phrases)
        :return: ndarray with shape (len(sents), dimension of embeddings)
        """
        max_len = max([len(sent) for sent in sents_tokened])
        elmo_embedding = self.elmo.sents2elmo(sents_tokened, output_layer=-2)
        elmo_embedding = [
            np.pad(emb,
                   pad_width=((0, 0), (0, max_len - emb.shape[1]), (0, 0)),
                   mode='constant') for emb in elmo_embedding
        ]
        elmo_embedding = torch.from_numpy(np.array(elmo_embedding))
        return elmo_embedding


# if __name__ == '__main__':
#     sents = [['今', '天', '天气', '真', '好', '啊'],
#              ['潮水', '退', '了', '就', '知道', '谁', '没', '穿', '裤子']]
#     elmo = WordEmbeddings()
#     embs = elmo.get_tokenized_words_embeddings(sents)
#     print("OK")
Example 10
class ElmoModel(BaseModel):
    def __init__(self, path_to_model):
        self._embedder = Embedder(path_to_model)

    def process(self, sentences):
        return [
            np.mean(embeds, axis=0)
            for embeds in self._embedder.sents2elmo(sentences)
        ]
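A hedged usage sketch for the mean-pooling wrapper above, assuming BaseModel adds no extra constructor requirements; the model path is a placeholder and the scikit-learn call only illustrates one use of the pooled vectors:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

model = ElmoModel('/path/to/elmo_model')        # hypothetical model directory
sentences = [['今', '天', '天气', '真', '好'],
             ['今', '天', '天气', '不', '错']]
vectors = model.process(sentences)              # one mean-pooled 1024-dim vector per sentence
print(cosine_similarity(np.vstack(vectors)))    # pairwise similarity matrix, shape (2, 2)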
Example 11
def get_elmo_vector(vocab: list) -> np.ndarray:

    from elmoformanylangs import Embedder
    e = Embedder('{}/zhs.model'.format(DATA_DIR))

    vectors = np.zeros((len(vocab), 1024))
    vectors[2:] = e.sents2elmo(vocab[2:])
    scio.savemat('{}/elmo.mat'.format(DATA_DIR), {'vectors': vectors})
    return vectors
Example 12
class Foreign_Elmo():
    def __init__(self, dir_name, embedding_source, device=None):
        self.embedding_source = embedding_source
        self.device = device
        print('loading the embedder')
        self.e = Embedder(dir_name)
        print('finish loading the embedder')

    def _get_embeddings(self, sent_lst):
        # if self.embedding_source  == 'elmo_0':
        #     type = self.e.sents2elmo(sent_lst, 0)
        #     type = [torch.tensor(x).unsqueeze(1) for x in type]
        #     return type, type
        # else:
        full_type = []
        full_token = []
        for batch_ in sent_lst:
            # print(len(batch_), len(batch_[0]))
            result = torch.tensor(self.e.sents2elmo(batch_, -2))
            # print(result.shape)
            type_ = result[:, 0, :, :]
            if self.embedding_source == 'elmo_1':
                index = 1
            elif self.embedding_source == 'elmo_2':
                index = 2
            elif self.embedding_source == 'elmo_0':
                index = 0
            token_ = result[:, index, :, :]
            full_type.append(type_)
            full_token.append(token_)
        return full_token, full_type

    def get_part_elmo(self, batch_):
        result = torch.tensor(self.e.sents2elmo(batch_, -2))
        # print(result.shape)
        if self.embedding_source == 'elmo_1':
            index = 1
        elif self.embedding_source == 'elmo_2':
            index = 2
        elif self.embedding_source == 'elmo_0':
            index = 0
        token_ = result[:, index, :, :]
        return token_
Example 13
class embed(object):
    def __init__(self, source, path):
        self.source = source
        self.path = path
        self._load_model()
    def _aravec(self, path):
        self.model = gensim.models.Word2Vec.load(path)
        self.vector_size = self.model.vector_size
    def _fasttext(self, path):
        self.model = fasttext.load_model(path)
        self.vector_size = self.model.get_dimension()
    def _elmo(self, path):
        self.model = Embedder(path, 64)
        
    def _load_model(self):
        if self.source == "aravec":
            self._aravec(self.path)
        elif self.source == "fasttext":
            self._fasttext(self.path)
        elif self.source == "elmo":
            self._elmo(self.path)
        else:
            raise ValueError("Model not supported. Please select either aravec, fasttext or elmo")
   
    def _embed_single(self, text, max_len):
        if self.source == "aravec":
            embedding = [self.model.wv[i].reshape(-1, self.vector_size) for i in text.split() if i in self.model.wv]
            if len(embedding) == 0:
                return self._pad(np.zeros((1, self.vector_size)), max_len)
            embedding = np.concatenate(embedding, axis=0)
            return self._pad(embedding, max_len)
        if self.source == "fasttext":
            embedding = [self.model.get_word_vector(i).reshape(-1, self.vector_size) for i in text.split()]
            embedding = np.concatenate(embedding, axis=0)
            return self._pad(embedding, max_len)
                    
    def embed_batch(self, text_list, max_len):
        if self.source == "elmo":
            input_segmented = [i.split() for i in text_list]
            embedding = self.model.sents2elmo(input_segmented)
            embedding = [self._pad(i, max_len) for i in embedding]            
            return np.concatenate(embedding, axis=0)
        else:    
            batch = [self._embed_single(i, max_len) for i in text_list]
            return np.concatenate(batch)
    
    def _pad(self, array, max_len):
        if array.shape[0] >= max_len:
            return np.expand_dims(array[:max_len],0)
        else:
            padding_size = max_len - array.shape[0]
            return np.expand_dims(np.pad(array, [(0, padding_size), (0, 0)], mode='constant', constant_values=0), 0)
Example 14
def elmo_data(samples):

    from elmoformanylangs import Embedder
    elmo = Embedder('data/elmo_model', batch_size=1)

    data = SimpleNamespace()
    data.x = elmo.sents2elmo([sample.text for sample in samples])
    data.x = np.array([
        np.mean(datum, axis=0)
        for datum
        in data.x
    ])
    data.y = np.array([sample.label for sample in samples])

    return data
Example 15
def test_elmoformanylangs():

    e = Embedder('/Users/feili/project/ELMoForManyLangs/output/en')
    # e = Embedder('/Users/feili/resource/data_to_train_emb/elmo_ade_lower_0norm_200d')

    sents = [['LABA', ',', 'such', 'as', 'vilanterol']]
    # for idx, sent in enumerate(sents):
    #     for idy, tk in enumerate(sent):
    #         sents[idx][idy] = sents[idx][idy].lower()

    ret = e.sents2elmo(sents)
    # will return a list of numpy arrays
    # each with the shape=(seq_len, embedding_size)

    pass
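Following on from the comment above, a minimal sketch (model path is a placeholder) of mean-pooling the per-token arrays into fixed-size sentence vectors, the same pattern used by several other examples on this page:

import numpy as np
from elmoformanylangs import Embedder

e = Embedder('/path/to/elmo/en')                # hypothetical model directory
sents = [['LABA', ',', 'such', 'as', 'vilanterol'],
         ['a', 'second', 'sentence']]

token_vecs = e.sents2elmo(sents)                # list of (seq_len, 1024) arrays
sent_vecs = np.vstack([v.mean(axis=0) for v in token_vecs])
print(sent_vecs.shape)                          # (2, 1024)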
Example 16
def load_elmo(all_tokens, language=""):
    # here, we will load ELMO.
    # For Chinese + Vietnamese, we use pretrained https://github.com/HIT-SCIR/ELMoForManyLangs
    all_tokens = list(set(all_tokens))
    current_word2idx = {all_tokens[i]: i + 4 for i in range(len(all_tokens))}
    current_word2idx["unk"] = 1
    current_word2idx["<SOS>"] = 2
    current_word2idx["<EOS>"] = 3
    weights = []
    elmo = ElmoEmbedder()
    if language == "zh":
        e = Embedder('179')
        weights = e.sents2elmo(all_tokens)
        weights = preprocess_weights(weights)
        # so this basically returns a 3x 1024 for any words that are
        # in the token_embedding
        # and nan for any words not in the embedding
    elif language == "vi":
        pdb.set_trace()
        e = Embedder('178')
        weights = e.sents2elmo(all_tokens)
        weights = preprocess_weights(weights)
    else:
        weights = elmo.embed_sentence(all_tokens)
        weights = np.mean(weights, axis=0)
    # take the average
    final_weights = []
    final_weights.append([0] * EMBED_DIM)
    unknown_vector = list(np.random.normal(size=(EMBED_DIM, )))
    start_vector = list(np.random.normal(size=(EMBED_DIM, )))
    end_vector = list(np.random.normal(size=(EMBED_DIM, )))
    final_weights.append(unknown_vector)
    final_weights.append(start_vector)
    final_weights.append(end_vector)
    final_weights.extend(weights)
    return current_word2idx, final_weights
Example 17
class ElmoEncoder(BaseTextEncoder):
    is_trained = True
    batch_size = 64

    def __init__(self,
                 model_dir: str,
                 pooling_layer: int = -1,
                 pooling_strategy: str = 'REDUCE_MEAN',
                 *args,
                 **kwargs):
        super().__init__(*args, **kwargs)

        self.model_dir = model_dir

        if pooling_layer > 2:
            raise ValueError('pooling_layer = %d is not supported now!' %
                             pooling_layer)
        self.pooling_layer = pooling_layer
        self.pooling_strategy = pooling_strategy

    def post_init(self):
        from elmoformanylangs import Embedder
        from ...helper import Tokenizer
        self._elmo = Embedder(model_dir=self.model_dir,
                              batch_size=self.batch_size)
        self.cn_tokenizer = Tokenizer()

    @batching
    def encode(self, text: List[str], *args, **kwargs) -> np.ndarray:
        # tokenize text
        batch_tokens = [self.cn_tokenizer.tokenize(sent) for sent in text]

        elmo_encodes = self._elmo.sents2elmo(batch_tokens, output_layer=-2)

        pooled_data = []
        for token_encodes in elmo_encodes:
            if self.pooling_layer == -1:
                _layer_data = np.average(token_encodes, axis=0)
            elif self.pooling_layer >= 0:
                _layer_data = token_encodes[self.pooling_layer]
            else:
                raise ValueError('pooling_layer = %d is not supported now!' %
                                 self.pooling_layer)

            _pooled = pooling_np(_layer_data, self.pooling_strategy)
            pooled_data.append(_pooled)
        return np.array(pooled_data, dtype=np.float32)
Example 18
class ELMo_Model():
    # follow the below repo to download the data
    # https://github.com/HIT-SCIR/ELMoForManyLangs
    def __init__(self):
        self.model = Embedder('elmo_model/')

    def similarity(self, word1: str, word2: str):
        if word1 is None or word2 is None:
            return None
        else:
            try:
                vec_word1 = self.model.sents2elmo([[word1]])[0]
                vec_word2 = self.model.sents2elmo([[word2]])[0]
                cos_similarity = cosine_similarity(vec_word1,vec_word2)[0][0]
            except KeyError:
                cos_similarity = 0
            return cos_similarity
Example 19
class PhraseNeighbors:
    def __init__(self, model_dir):
        self.faiss_db = faiss.read_index(os.path.join(model_dir, 'faiss_db'))
        with open(os.path.join(model_dir, 'faiss_lookup.json')) as f:
            self.faiss_lookup = json.loads(f.read())
        self.elmo = Embedder(os.path.join(model_dir, 'elmo_nl'))
        self.tokenizer = Tokenizer()

    def query(self, s, topn):
        s = [w.value for w in self.tokenizer.tokenize(s)]
        X = self.elmo.sents2elmo([s])
        X = np.array([x.mean(axis=0) for x in X])

        distances, indices = self.faiss_db.search(X, k=topn)

        return [(self.faiss_lookup[i], d)
                for i, d in zip(indices[0], distances[0])]
Example 20
class elmo_embedding_layer(nn.Module):
    def __init__(self, vocab):
        super(elmo_embedding_layer, self).__init__()
        self.embed = Embedder('./embed_module/elmo/chinese')
        self.vocab = vocab

    def forward(self, x):
        '''
        To keep the calling code consistent, the input here is a batch_sz x seq_len tensor of token ids.
        Steps:
        1. use vocab to convert the ids back into sentence lists
        2. run the ELMo forward pass
        :param x: tensor [batch_sz x seq_len]
        :return: tensor [batch_sz x seq_len x dim]
        '''
        seq_batch = [[self.vocab[i] for i in seq] for seq in x]
        seq_embed = self.embed.sents2elmo(seq_batch)
        return torch.Tensor(seq_embed)
Example 21
def trans_elmo(dic):
    e = Embedder('pretrained/')
    sents = []
    seg = Wordseg(batch_size=8,
                  device="cuda:0",
                  embedding='elmo',
                  elmo_use_cuda=False,
                  mode="TW")
    for key in dic:
        sents.append(dic[key])
    sents = seg.cut(sents)
    vector = e.sents2elmo(sents, -1)
    print(len(vector))
    i = 0
    for key in dic:
        dic[key] = vector[i][0]
        i += 1
    return dic
Example 22
class ElmoEmbeddings:
    def __init__(
            self,
            modelPath="/home/joan/Escritorio/ELMoForManyLangs-master/EN/"):
        self.elmo = Embedder(modelPath)

    def getSentenceVector(self, sentence):
        vecs = self.elmo.sents2elmo(sentence)
        vectorList = []
        for vec in vecs:
            vectorList.append(vec[0].tolist())

        return vectorList

    def getCentroid(self, vectors):
        avgVector = np.mean(vectors, axis=0)
        return avgVector.tolist()

    def distance(self, A, B, distance="cosine"):
        return cdist([A], [B], distance)
Example 23
class WordEmbeddings():
    """
        ELMo
        https://allennlp.org/elmo

    """
    def __init__(self,
                 model_path=r'../auxiliary_data/zhs.model/',
                 cuda_device=0):
        self.cuda_device = cuda_device
        self.elmo = Embedder(model_path)

    def get_tokenized_words_embeddings(self, sents_tokened):
        """
        @see EmbeddingDistributor
        :param sents_tokened: list of tokenized sentences/phrases
        :return: list of arrays, one per sentence, each of shape (3, seq_len, embedding_dim)
        """

        elmo_embedding = self.elmo.sents2elmo(
            sents_tokened, output_layer=-2)  # list of arrays; one entry per sentence
        # elmo_embedding = torch.from_numpy(np.array(elmo_embedding))
        return elmo_embedding
Example 24
def get_elmo_embedding(elmo_folder, token_file, sentence_vector_file):
    file_data = open(token_file, 'r', encoding="UTF-8")
    data_lines = file_data.readlines()
    batch_size = 32
    file_data.close()
    # Save one vector per sentence, one sentence per line; the original code summed the two features: return q1_features + q2_features
    sentences = []
    for line in data_lines:
        sentences.append(line.strip().split(" "))
    print("number of sentences: ", len(sentences))
    elmo = Embedder(elmo_folder)
    elmo_vectors = []
    for i in tqdm(range(int(len(sentences) / batch_size) + 1)):
        sentences_curr = sentences[i * batch_size:i * batch_size + batch_size]
        embedding = elmo.sents2elmo(sentences_curr, output_layer=-1)  # 1024-dimensional
        elmo_vectors += embedding

    assert len(sentences) == len(
        elmo_vectors), "len(data_lines) != len(elmo_vectors)"
    print("len(elmo_vectors): ", len(elmo_vectors))
    output_file = open(sentence_vector_file, 'wb')  # binary mode takes no encoding argument
    pk.dump(elmo_vectors, output_file)
    output_file.close()
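For completeness, a small sketch of reading the pickled vectors back for downstream use; the file name is a placeholder for whatever was passed as sentence_vector_file above:

import pickle as pk

with open('sentence_vectors.pkl', 'rb') as f:    # same path as sentence_vector_file
    elmo_vectors = pk.load(f)
print(len(elmo_vectors), elmo_vectors[0].shape)  # number of sentences, (seq_len, 1024)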
Example 25
def classify():

    y = np.loadtxt('ELmo_20news_group_rep\\y_train_reduced.csv', delimiter=',')
    X = np.loadtxt('ELmo_20news_group_rep\\X_train_reduced.csv', delimiter=',')

    centroids = [['car', 'engine', 'drive', 'speed'],
                 ['religion', 'jesus', 'god', 'believe', 'heaven', 'sin'],
                 [
                     'baseball', 'player', 'run', 'sport', 'hit', 'bat',
                     'rotation'
                 ],
                 ['electronics', 'conductive', 'power', 'resistor', 'circuit'],
                 ['medical', 'methodology', 'science', 'molecule', 'virus']]

    e = Embedder('..\\PreTrainedElmo_EN', batch_size=64)
    em_vecs = [
        np.mean(e.sents2elmo(cat_taxo)[0], axis=0) for cat_taxo in centroids
    ]

    # X_train = np.loadtxt('custom_doc2vec_data\\X_train.csv', delimiter=',')
    # y_train = np.loadtxt('custom_doc2vec_data\\y_train.csv', delimiter=',')
    # dist = met.pairwise_distances(X= X_train,Y=list_centroids_vectors[0].reshape(1, -1),metric='cosine')
    dist = met.pairwise_distances(X=X, Y=np.vstack(em_vecs), metric='cosine')
    print(dist.shape)
    indexes = np.argmin(dist, axis=1)

    diff_list = (indexes - y).tolist()
    diff = [1 if d == 0 else 0 for d in diff_list]
    print('taxonomy-based semi supervised classification accuracy : {}'.format(
        sum(diff) / len(diff)))

    plt.plot(indexes)

    plt.figure()
    plt.plot(y)

    plt.show()
Example 26
class BiaffineDependencyParser(Parser):
    r"""
    The implementation of Biaffine Dependency Parser.

    References:
        - Timothy Dozat and Christopher D. Manning. 2017.
          `Deep Biaffine Attention for Neural Dependency Parsing`_.

    .. _Deep Biaffine Attention for Neural Dependency Parsing:
        https://openreview.net/forum?id=Hk95PK9le
    """

    NAME = 'biaffine-dependency'
    MODEL = BiaffineDependencyModel

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        if self.args.feat in ('char', 'bert', 'elmo'):
            self.WORD, self.FEAT = self.transform.FORM
        else:
            self.WORD, self.FEAT = self.transform.FORM, self.transform.CPOS
        self.ARC, self.REL = self.transform.HEAD, self.transform.DEPREL
        self.puncts = torch.tensor([i
                                    for s, i in self.WORD.vocab.stoi.items()
                                    if ispunct(s)]).to(self.args.device)
        if self.args.elmo_options:
            self.elmo = ElmoEmbedder(self.args.elmo_options, self.args.elmo_weights, -1)
        else:
            self.efml = EFML(self.args.elmo_weights)
            self.elmo = False
        #print(self.__dict__)
        if self.args.map_method == 'vecmap':
            self.mapper = Vecmap(vars(self.args))
        elif self.args.map_method == 'elmogan':
            self.mapper = Elmogan(vars(self.args))
        elif self.args.map_method == 'muse':
            self.mapper = Muse(vars(self.args))
        else:
            self.mapper = None
            
    def train(self, train, dev, test, buckets=32, batch_size=5000,
              punct=False, tree=False, proj=False, partial=False, verbose=True, **kwargs):
        r"""
        Args:
            train/dev/test (list[list] or str):
                Filenames of the train/dev/test datasets.
            buckets (int):
                The number of buckets that sentences are assigned to. Default: 32.
            batch_size (int):
                The number of tokens in each batch. Default: 5000.
            punct (bool):
                If ``False``, ignores the punctuations during evaluation. Default: ``False``.
            tree (bool):
                If ``True``, ensures to output well-formed trees. Default: ``False``.
            proj (bool):
                If ``True``, ensures to output projective trees. Default: ``False``.
            partial (bool):
                ``True`` denotes the trees are partially annotated. Default: ``False``.
            verbose (bool):
                If ``True``, increases the output verbosity. Default: ``True``.
            kwargs (dict):
                A dict holding the unconsumed arguments that can be used to update the configurations for training.
        """

        return super().train(**Config().update(locals()))

    def evaluate(self, data, buckets=8, batch_size=5000,
                 punct=False, tree=True, proj=False, partial=False, verbose=True, **kwargs):
        r"""
        Args:
            data (str):
                The data for evaluation, both list of instances and filename are allowed.
            buckets (int):
                The number of buckets that sentences are assigned to. Default: 8.
            batch_size (int):
                The number of tokens in each batch. Default: 5000.
            punct (bool):
                If ``False``, ignores the punctuations during evaluation. Default: ``False``.
            tree (bool):
                If ``True``, ensures to output well-formed trees. Default: ``True``.
            proj (bool):
                If ``True``, ensures to output projective trees. Default: ``False``.
            partial (bool):
                ``True`` denotes the trees are partially annotated. Default: ``False``.
            verbose (bool):
                If ``True``, increases the output verbosity. Default: ``True``.
            kwargs (dict):
                A dict holding the unconsumed arguments that can be used to update the configurations for evaluation.

        Returns:
            The loss scalar and evaluation results.
        """
        if kwargs['elmo_options']:
            self.elmo = ElmoEmbedder(kwargs['elmo_options'], kwargs['elmo_weights'], -1)
        else:
            self.efml = EFML(kwargs['elmo_weights'])
        if kwargs['map_method'] == 'vecmap':
            self.mapper = Vecmap(kwargs)
            #print(self.mapper)
        elif kwargs['map_method'] == 'elmogan':
            self.mapper = Elmogan(kwargs)
            #print(self.mapper)
        elif kwargs['map_method'] == 'muse':
            self.mapper = Muse(kwargs)
        else:
            self.mapper = None

        return super().evaluate(**Config().update(locals()))

    def predict(self, data, pred=None, buckets=8, batch_size=5000,
                prob=False, tree=True, proj=False, verbose=True, **kwargs):
        r"""
        Args:
            data (list[list] or str):
                The data for prediction, both a list of instances and filename are allowed.
            pred (str):
                If specified, the predicted results will be saved to the file. Default: ``None``.
            buckets (int):
                The number of buckets that sentences are assigned to. Default: 8.
            batch_size (int):
                The number of tokens in each batch. Default: 5000.
            prob (bool):
                If ``True``, outputs the probabilities. Default: ``False``.
            tree (bool):
                If ``True``, ensures to output well-formed trees. Default: ``True``.
            proj (bool):
                If ``True``, ensures to output projective trees. Default: ``False``.
            verbose (bool):
                If ``True``, increases the output verbosity. Default: ``True``.
            kwargs (dict):
                A dict holding the unconsumed arguments that can be used to update the configurations for prediction.

        Returns:
            A :class:`~supar.utils.Dataset` object that stores the predicted results.
        """
        
        if kwargs['elmo_options']:
            self.elmo = ElmoEmbedder(kwargs['elmo_options'], kwargs['elmo_weights'], -1)
        else:
            self.efml = EFML(kwargs['elmo_weights'])
        if kwargs['map_method'] == 'vecmap':
            self.mapper = Vecmap(kwargs)
            #print(self.mapper)
        elif kwargs['map_method'] == 'elmogan':
            self.mapper = Elmogan(kwargs)
            #print(self.mapper)
        elif kwargs['map_method'] == 'muse':
            self.mapper = Muse(kwargs)
        else:
            self.mapper = None
        return super().predict(**Config().update(locals()))

    def _train(self, loader):
        self.model.train()

        bar, metric = progress_bar(loader), AttachmentMetric()
        # words, feats, etc. come from loader! loader is train.loader, where train is Dataset
        for words, feats, arcs, rels in bar:
            self.optimizer.zero_grad()
            if self.elmo:
                feat_embs = self.elmo.embed_batch(feats)
            else:
                feat_embs = self.efml.sents2elmo(feats, output_layer=-2)
            #TODO: apply the mapping, if and only if the method is vecmap
            if self.args.map_method == 'vecmap':
                # map feat_embs with vecmap, actually self.mapper defined in class init
                feat_embs = self.mapper.map_batch(feat_embs)
                
            mask = words.ne(self.WORD.pad_index)
            # ignore the first token of each sentence
            mask[:, 0] = 0
            
            feats0 = torch.zeros(words.shape+(1024,)) # words.clone()
            feats1 = torch.zeros(words.shape+(1024,))
            feats2 = torch.zeros(words.shape+(1024,))
            # words get ignored, all input comes from feats - 3 elmo layers
            # still inputting words due to reasons(tm)
            
            #feats0 = feats0.unsqueeze(-1)
            #feats0 = feats0.expand(words.shape+(1024,))
            for sentence in range(len(feat_embs)):
                for token in range(len(feat_embs[sentence][1])):
                    feats0[sentence][token] = torch.Tensor(feat_embs[sentence][0][token])
                    feats1[sentence][token] = torch.Tensor(feat_embs[sentence][1][token])
                    feats2[sentence][token] = torch.Tensor(feat_embs[sentence][2][token])
            feats = torch.cat((feats0, feats1, feats2), -1)
            if str(self.args.device) == '-1':
                feats = feats.to('cpu')
            else:
                feats = feats.to('cuda:'+str(self.args.device)) #TODO: fix to allow cpu or gpu
            s_arc, s_rel = self.model(words, feats) #INFO: here is the data input, y = model(x)
            loss = self.model.loss(s_arc, s_rel, arcs, rels, mask, self.args.partial)
            loss.backward()
            nn.utils.clip_grad_norm_(self.model.parameters(), self.args.clip)
            self.optimizer.step()
            self.scheduler.step()

            arc_preds, rel_preds = self.model.decode(s_arc, s_rel, mask)
            if self.args.partial:
                mask &= arcs.ge(0)
            # ignore all punctuation if not specified
            if not self.args.punct:
                mask &= words.unsqueeze(-1).ne(self.puncts).all(-1)
            metric(arc_preds, rel_preds, arcs, rels, mask)
            bar.set_postfix_str(f"lr: {self.scheduler.get_last_lr()[0]:.4e} - loss: {loss:.4f} - {metric}")

    @torch.no_grad()
    def _evaluate(self, loader):
        print("called _evaluate function")
        print(self.mapper)
        self.model.eval()

        total_loss, metric = 0, AttachmentMetric()

        for words, feats, arcs, rels in loader:
            if self.elmo:
                feat_embs0 = self.elmo.embed_batch(feats)
            else:
                feat_embs0 = self.efml.sents2elmo(feats, output_layer=-2)
            if self.mapper:
                # map feat_embs with self.mapper defined in class init
                feat_embs = self.mapper.map_batch(feat_embs0)
            else:
                feat_embs = feat_embs0
            mask = words.ne(self.WORD.pad_index)
            # ignore the first token of each sentence
            mask[:, 0] = 0
            feats0 = torch.zeros(words.shape+(1024,))
            feats1 = torch.zeros(words.shape+(1024,))
            feats2 = torch.zeros(words.shape+(1024,))
            for sentence in range(len(feat_embs)):
                for token in range(len(feat_embs[sentence][1])):
                    feats0[sentence][token] = torch.Tensor(feat_embs[sentence][0][token])
                    feats1[sentence][token] = torch.Tensor(feat_embs[sentence][1][token])
                    feats2[sentence][token] = torch.Tensor(feat_embs[sentence][2][token])
            feats = torch.cat((feats0, feats1, feats2), -1)
            if str(self.args.device) == '-1':
                feats = feats.to('cpu')
            else:
                feats = feats.to('cuda:'+str(self.args.device))
            s_arc, s_rel = self.model(words, feats)
            loss = self.model.loss(s_arc, s_rel, arcs, rels, mask, self.args.partial)
            arc_preds, rel_preds = self.model.decode(s_arc, s_rel, mask,
                                                     self.args.tree,
                                                     self.args.proj)
            if self.args.partial:
                mask &= arcs.ge(0)
            # ignore all punctuation if not specified
            if not self.args.punct:
                mask &= words.unsqueeze(-1).ne(self.puncts).all(-1)
            total_loss += loss.item()
            metric(arc_preds, rel_preds, arcs, rels, mask)
        total_loss /= len(loader)

        return total_loss, metric

    @torch.no_grad()
    def _predict(self, loader):
        self.model.eval()

        preds = {}
        arcs, rels, probs = [], [], []
        for words, feats in progress_bar(loader):
            if self.elmo:
                feat_embs = self.elmo.embed_batch(feats)
            else:
                feat_embs = self.efml.sents2elmo(feats, output_layer=-2)
            if self.mapper:
                # map feat_embs with self.mapper defined in class init
                feat_embs = self.mapper.map_batch(feat_embs)
            mask = words.ne(self.WORD.pad_index)
            # ignore the first token of each sentence
            mask[:, 0] = 0
            lens = mask.sum(1).tolist()
            feats0 = torch.zeros(words.shape+(1024,))
            feats1 = torch.zeros(words.shape+(1024,))
            feats2 = torch.zeros(words.shape+(1024,))
            for sentence in range(len(feat_embs)):
                for token in range(len(feat_embs[sentence][1])):
                    feats0[sentence][token] = torch.Tensor(feat_embs[sentence][0][token])
                    feats1[sentence][token] = torch.Tensor(feat_embs[sentence][1][token])
                    feats2[sentence][token] = torch.Tensor(feat_embs[sentence][2][token])
            feats = torch.cat((feats0, feats1, feats2), -1)
            if str(self.args.device) == '-1':
                feats = feats.to('cpu')
            else:
                feats = feats.to('cuda:'+str(self.args.device))
            s_arc, s_rel = self.model(words, feats)
            arc_preds, rel_preds = self.model.decode(s_arc, s_rel, mask,
                                                     self.args.tree,
                                                     self.args.proj)
            arcs.extend(arc_preds[mask].split(lens))
            rels.extend(rel_preds[mask].split(lens))
            if self.args.prob:
                arc_probs = s_arc.softmax(-1)
                probs.extend([prob[1:i+1, :i+1].cpu() for i, prob in zip(lens, arc_probs.unbind())])
        arcs = [seq.tolist() for seq in arcs]
        rels = [self.REL.vocab[seq.tolist()] for seq in rels]
        preds = {'arcs': arcs, 'rels': rels}
        if self.args.prob:
            preds['probs'] = probs

        return preds

    @classmethod
    def build(cls, path, min_freq=2, fix_len=20, **kwargs):
        r"""
        Build a brand-new Parser, including initialization of all data fields and model parameters.

        Args:
            path (str):
                The path of the model to be saved.
            min_freq (int):
                The minimum frequency needed to include a token in the vocabulary. Default: 2.
            fix_len (int):
                The max length of all subword pieces. The excess part of each piece will be truncated.
                Required if using CharLSTM/BERT.
                Default: 20.
            kwargs (dict):
                A dict holding the unconsumed arguments.
        """

        args = Config(**locals())
        args.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        os.makedirs(os.path.dirname(path), exist_ok=True)
        if os.path.exists(path) and not args.build:
            parser = cls.load(**args)
            parser.model = cls.MODEL(**parser.args)
            parser.model.load_pretrained(parser.WORD.embed).to(args.device)
            return parser

        logger.info("Building the fields")
        WORD = Field('words', pad=pad, unk=unk, bos=bos, lower=True)
        if args.feat == 'char':
            FEAT = SubwordField('chars', pad=pad, unk=unk, bos=bos, fix_len=args.fix_len)
        elif args.feat == 'bert':
            from transformers import AutoTokenizer
            tokenizer = AutoTokenizer.from_pretrained(args.bert)
            FEAT = SubwordField('bert',
                                pad=tokenizer.pad_token,
                                unk=tokenizer.unk_token,
                                bos=tokenizer.bos_token or tokenizer.cls_token,
                                fix_len=args.fix_len,
                                tokenize=tokenizer.tokenize)
            FEAT.vocab = tokenizer.get_vocab()
        elif args.feat == 'elmo':
            logger.info("Hello, initing ElmoField")
            FEAT = ElmoField('elmo', bos=bos) #
        else:
            FEAT = Field('tags', bos=bos)
        ARC = Field('arcs', bos=bos, use_vocab=False, fn=CoNLL.get_arcs)
        REL = Field('rels', bos=bos)
        if args.feat in ('char', 'bert'):
            transform = CoNLL(FORM=(WORD, FEAT), HEAD=ARC, DEPREL=REL)
        elif args.feat == 'elmo':
            logger.info("calling CoNLL transform")
            # FEAT still has 3 layers here; this will need fixing somehow
            transform = CoNLL(FORM=(WORD, FEAT), HEAD=ARC, DEPREL=REL)
        else:
            transform = CoNLL(FORM=WORD, CPOS=FEAT, HEAD=ARC, DEPREL=REL)
        logger.info("initing train Dataset")
        train = Dataset(transform, args.train)
        #WORD.build(train, args.min_freq, (Embedding.load(args.embed, args.unk) if args.embed else None))
        logger.info("Building WORD, FEAT, REL fields")
        WORD.build(train)
        FEAT.build(train)
        REL.build(train)
        args.update({
            'n_words': WORD.vocab.n_init,
            'n_feats': len(FEAT.vocab),
            'n_rels': len(REL.vocab),
            'pad_index': WORD.pad_index,
            'unk_index': WORD.unk_index,
            'bos_index': WORD.bos_index,
            'feat_pad_index': FEAT.pad_index,
        })
        logger.info("Loading model")
        model = cls.MODEL(**args)
        model.load_pretrained(WORD.embed).to(args.device)
        return cls(args, model, transform)
Example 27
class DataModel():
    def __init__(self, opt):

        super(DataModel, self).__init__()
        self.opt = opt
        self.use_gpu = self.opt.use_gpu

        if self.opt.emb_method == 'elmo':
            self.init_elmo()
        elif self.opt.emb_method == 'glove':
            self.init_glove()
        elif self.opt.emb_method == 'word2vec':
            self.init_word2vec()
        elif self.opt.emb_method == 'elmo_word2vec':
            self.init_word2vec()
            self.init_elmo()
            self.word_dim = self.opt.elmo_dim + self.opt.word2vec_dim
        elif self.opt.emb_method == 'elmo_glove':
            self.init_elmo()
            self.init_glove()
            self.word_dim = self.opt.elmo_dim + self.opt.glove_dim
        elif self.opt.emb_method == 'all':
            self.init_elmo()
            self.init_glove()
            self.init_word2vec()
            self.word_dim = self.opt.elmo_dim + self.opt.glove_dim + self.opt.word2vec_dim

    def init_elmo(self):
        '''
        initialize the ELMo model
        '''
        self.elmo = Embedder(self.opt.elmo_model,
                             batch_size=self.opt.batch_size)

    def init_word2vec(self):
        self.word2vec_word2id = np.load(self.opt.word2vec_w2id_file).tolist()
        self.word2vec_vocab_size = len(self.word2vec_word2id)
        self.word2vec = nn.Embedding(self.word2vec_vocab_size,
                                     self.opt.word2vec_dim)
        emb = torch.from_numpy(np.load(self.opt.word2vec_file))
        self.word2vec.weight.data.copy_(emb)

    def init_glove(self):
        self.glove_word2id = np.load(self.opt.glove_w2id_file).tolist()
        self.glove_vocab_size = len(self.glove_word2id)
        self.glove = nn.Embedding(self.glove_vocab_size, self.opt.glove_dim)
        emb = torch.from_numpy(np.load(self.opt.glove_file))
        self.glove.weight.data.copy_(emb)

    def get_elmo(self, sentence_lists):
        '''
        get the ELMo word embedding vectors for a batch of sentences
        '''
        max_len = max(map(lambda x: len(x), sentence_lists))
        sentence_lists = self.elmo.sents2elmo(sentence_lists)
        sentence_end = []
        # pad all sentences to the same length
        for sentence in sentence_lists:
            sentence = sentence.tolist()
            for i in range(max_len - len(sentence)):
                sentence.append([0] * self.opt.elmo_dim)
            sentence_end.append(sentence)
        return torch.FloatTensor(sentence_end)

    def get_word2vec(self, sentence_lists):
        # vectors for word-segmented input
        max_len = max(map(lambda x: len(x), sentence_lists))
        sentence_lists = list(
            map(
                lambda x: list(
                    map(lambda w: self.word2vec_word2id.get(w, 1), x)),
                sentence_lists))
        # pad with all-zero vectors
        sentence_lists = list(
            map(lambda x: x + [0] * (max_len - len(x)), sentence_lists))
        sentence_lists = torch.LongTensor(sentence_lists)
        embeddings = self.word2vec(sentence_lists)
        return embeddings

    # vectors for character-segmented input
    def get_glove(self, sentence_lists):
        '''
        get the GloVe word embedding vectors for a batch of sentences
        '''
        max_len = max(map(lambda x: len(x), sentence_lists))
        # UNK   --> 1
        sentence_lists = list(
            map(lambda x: list(map(lambda w: self.glove_word2id.get(w, 1), x)),
                sentence_lists))
        # pad with all-zero vectors
        sentence_lists = list(
            map(lambda x: x + [0] * (max_len - len(x)), sentence_lists))
        sentence_lists = torch.LongTensor(sentence_lists)
        embeddings = self.glove(sentence_lists)
        return embeddings

    def get_data(self, x):
        if self.opt.emb_method == 'elmo':
            word_embs = self.get_elmo(x)
        elif self.opt.emb_method == 'glove':
            word_embs = self.get_glove(x)
        elif self.opt.emb_method == 'elmo_word2vec':
            word2vec = self.get_word2vec(x)
            elmo = self.get_elmo(x)
            word_embs = torch.cat([elmo, word2vec], -1)
        elif self.opt.emb_method == 'elmo_glove':
            glove = self.get_glove(x)
            elmo = self.get_elmo(x)
            word_embs = torch.cat([elmo, glove], -1)
        elif self.opt.emb_method == 'all':
            glove = self.get_glove(x)
            word2vec = self.get_word2vec(x)
            elmo = self.get_elmo(x)
            word_embs = torch.cat([elmo, glove, word2vec], -1)
        return word_embs
Example 28
                tokenlist.append(token)
    tokenlist = tokenlist + test_tokenlist
    return tokenlist


if __name__ == "__main__":

    filename = 'UD_English-EWT'
    language = "English-EWT"
    tokenlist = data_preprocesser(filename)
    word_data = data_extractor(tokenlist)
    word_index, sentence = word_tokenizer(word_data)

    k = Embedder('144/', batch_size=32)

    embedding_vectors = {}
    # word_index = {'the': 0, 'are': 1, 'is': 2}
    test_word = [[]]

    for word, index in word_index.items():
        test_word[0].append(word)

    diction = k.sents2elmo(test_word)

    for word, index in word_index.items():
        embedding_vectors[word] = list(diction[0][index])
        temp = np.asarray(embedding_vectors[word])

    savecsv(language, embedding_vectors)
Example 29
class WordRep(nn.Module):
    def __init__(self, data, opt):
        super(WordRep, self).__init__()

        self.gpu = opt.gpu
        self.batch_size = opt.batch_size

        self.use_elmo = False
        if opt.elmo:
            logging.info("use elmo, loading ...")
            self.use_elmo = True
            self.elmo = Embedder(data.config['elmo_path'])
            # we project the elmo representation to the same dim of char embedding
            self.elmo_projection = nn.Linear(
                self.elmo.config['encoder']['projection_dim'] * 2,
                opt.char_hidden_dim, False)
            self.elmo_drop = nn.Dropout(opt.dropout)
        else:
            self.char_hidden_dim = opt.char_hidden_dim
            self.char_embedding_dim = opt.char_emb_dim
            self.char_feature = CharCNN(data.char_alphabet.size(), None,
                                        self.char_embedding_dim,
                                        self.char_hidden_dim, opt.dropout,
                                        self.gpu)

        self.embedding_dim = data.word_emb_dim
        self.drop = nn.Dropout(opt.dropout)
        self.word_embedding = nn.Embedding(data.word_alphabet.size(),
                                           self.embedding_dim)
        if data.pretrain_word_embedding is not None:
            self.word_embedding.weight.data.copy_(
                torch.from_numpy(data.pretrain_word_embedding))
        else:
            self.word_embedding.weight.data.copy_(
                torch.from_numpy(
                    self.random_embedding(data.word_alphabet.size(),
                                          self.embedding_dim)))

        if data.feat_config is not None:
            self.feature_num = len(data.feature_alphabets)
            self.feature_embedding_dims = data.feature_emb_dims
            self.feature_embeddings = nn.ModuleList()
            for idx in range(self.feature_num):
                emb = nn.Embedding(data.feature_alphabets[idx].size(),
                                   self.feature_embedding_dims[idx])
                emb.weight.data.copy_(
                    torch.from_numpy(
                        self.random_embedding(
                            data.feature_alphabets[idx].size(),
                            self.feature_embedding_dims[idx])))
                self.feature_embeddings.append(emb)
        else:
            self.feature_num = 0

        if opt.gpu >= 0 and torch.cuda.is_available():
            self.drop = self.drop.cuda(self.gpu)
            self.word_embedding = self.word_embedding.cuda(self.gpu)
            if data.feat_config is not None:
                for idx in range(self.feature_num):
                    self.feature_embeddings[idx] = self.feature_embeddings[
                        idx].cuda(self.gpu)

            if opt.elmo:
                self.elmo_projection = self.elmo_projection.cuda(self.gpu)
                self.elmo_drop = self.elmo_drop.cuda(self.gpu)

    def random_embedding(self, vocab_size, embedding_dim):
        pretrain_emb = np.zeros([vocab_size, embedding_dim])
        scale = np.sqrt(3.0 / embedding_dim)
        for index in range(vocab_size):
            pretrain_emb[index, :] = np.random.uniform(-scale, scale,
                                                       [1, embedding_dim])
        return pretrain_emb

    def forward(self, word_inputs, word_seq_lengths, char_inputs,
                char_seq_lengths, char_seq_recover, feature_inputs,
                text_inputs):
        """
            input:
                word_inputs: (batch_size, sent_len)
                features: list [(batch_size, sent_len), (batch_len, sent_len),...]
                word_seq_lengths: list of batch_size, (batch_size,1)
                char_inputs: (batch_size*sent_len, word_length)
                char_seq_lengths: list of whole batch_size for char, (batch_size*sent_len, 1)
                char_seq_recover: variable which records the char order information, used to recover char order
            output: 
                Variable(batch_size, sent_len, hidden_dim)
        """
        batch_size = word_inputs.size(0)
        sent_len = word_inputs.size(1)
        word_embs = self.word_embedding(word_inputs)
        word_list = [word_embs]
        for idx in range(self.feature_num):
            word_list.append(self.feature_embeddings[idx](feature_inputs[idx]))

        if self.use_elmo:
            with torch.no_grad():
                elmo_rep = torch.from_numpy(
                    np.array(self.elmo.sents2elmo(
                        text_inputs)))  # batch, seq_len, 1024
                if self.gpu >= 0 and torch.cuda.is_available():
                    elmo_rep = elmo_rep.cuda(self.gpu)

            char_features = self.elmo_drop(self.elmo_projection(elmo_rep))
            # char_features = elmo_rep

        else:

            char_features = self.char_feature.get_last_hiddens(
                char_inputs,
                char_seq_lengths.cpu().numpy())
            char_features = char_features[char_seq_recover]
            char_features = char_features.view(batch_size, sent_len, -1)

        word_list.append(char_features)

        word_embs = torch.cat(word_list, 2)
        word_represent = self.drop(word_embs)
        return word_represent
Example 30
from elmoformanylangs import Embedder

e = Embedder('./elmo_chinese')

sents = [['今', '天', '天气', '真', '好', '阿'],
         ['潮水', '退', '了', '就', '知道', '谁', '沒', '起床']]

output = e.sents2elmo(sents)  # will return a list of numpy arrays each with the shape=(seq_len, embedding_size)

print(output)
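Reusing e and sents from the snippet above, a sketch of the output_layer argument that several earlier examples rely on. The layer semantics are an assumption taken from the ELMoForManyLangs README (0 = word encoder, 1/2 = the two biLSTM layers, -1 = average of the three, -2 = all three layers) and are worth verifying against the installed version:

avg = e.sents2elmo(sents)                          # default output_layer=-1: average of the 3 layers, (seq_len, 1024) per sentence
word_enc = e.sents2elmo(sents, output_layer=0)     # context-independent word encoder layer
all_layers = e.sents2elmo(sents, output_layer=-2)  # all 3 layers stacked, (3, seq_len, 1024) per sentence

print(avg[0].shape, word_enc[0].shape, all_layers[0].shape)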