Example No. 1
def generate_data_set(test_cats=my_cats, subset='train'):
    dataset_list = [list(read_corpus(fetch_20newsgroups(subset=subset,
                                            remove=('headers', 'footers', 'quotes'),
                                        categories=[cat])['data'],
                                        tokens_only=True,  bFastText=False, bRemoveStopWords=True))\
                            for cat in test_cats]
    corpus = [doc for category_list in dataset_list for doc in category_list]

    categories_lengths = [len(cat_list) for cat_list in dataset_list]
    categories = [[k for _ in range(0, length)]
                  for k, length in enumerate(categories_lengths)]

    cats = [cat for elem_list in categories for cat in elem_list]
    y = np.array(cats)
    np.savetxt('ELmo_20news_group_rep\\y_train_reduced.csv',
               y[range(0, len(cats), 50)],
               delimiter=",")

    print('raw corpus ELMO Rep {} size {}'.format(subset, len(corpus)))
    e = Embedder('..\\PreTrainedElmo_EN', batch_size=64)

    try:
        os.remove('ELmo_20news_group_rep\\X_train_reduced.csv')
    except:
        pass

    with open('ELmo_20news_group_rep\\X_train_reduced.csv',
              mode='a') as myFile:
        # for i in range(0,2):
        for i in range(0, len(corpus), 50):
            em = np.mean(e.sents2elmo([corpus[i]])[0], axis=0)
            # print (em.shape)
            myFile.write('{}'.format(
                em.tolist()).strip('[').strip(']').replace(' ', ''))
            myFile.write('\n')
Example No. 2
    def trainELMoToFile(self, elmoModelPath, filename, words, vocab_size):
        """Build ELMo word embeddings
		
		elmoModelPath: the absolute path from the model to process*
		filename: filename to save word embeddings
		words: word list (vocabulary)
		vocab_size: length of word list (vocabulary)

		* visit to https://github.com/HIT-SCIR/ELMoForManyLangs#downloads to download Spanish model and check set ups.
		"""
        from elmoformanylangs import Embedder
        import numpy as np
        e = Embedder(
            elmoModelPath
        )  # path from loaded model (e.g. /home/user/project/ELMo.es/)
        embedding_matrix = np.zeros((vocab_size, 1024))

        for i, word in enumerate(words):
            aux_elmo = e.sents2elmo([[word]])
            with open(filename, 'a') as g:
                strnums = [str(num) for num in aux_elmo[0][0].tolist()]
                strnums = ' '.join(strnums)
                g.write("{} {}\n".format(word, strnums))
                # print ("Processing \t{} of {}...".format( i+1, len(words)) )
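A minimal standalone sketch of the same idea (not the author's class): embed each vocabulary word with ELMoForManyLangs and write one "word v1 v2 ..." line per word. The model path and the vocabulary below are placeholders.

from elmoformanylangs import Embedder

model_path = '/path/to/ELMo.es'      # placeholder: downloaded Spanish model directory
words = ['hola', 'mundo', 'adios']   # placeholder vocabulary

e = Embedder(model_path)
with open('elmo_vocab_vectors.txt', 'w') as out:
    for word in words:
        # sents2elmo expects a list of tokenized sentences; [[word]] is one single-token sentence
        vec = e.sents2elmo([[word]])[0][0]   # 1024-dim vector for that token
        out.write('{} {}\n'.format(word, ' '.join(str(v) for v in vec.tolist())))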
Example No. 3
def get_elmo_vector(vocab: list) -> np.ndarray:

    from elmoformanylangs import Embedder
    e = Embedder('{}/zhs.model'.format(DATA_DIR))

    vectors = np.zeros((len(vocab), 1024))
    vectors[2:] = e.sents2elmo(vocab[2:])
    scio.savemat('{}/elmo.mat'.format(DATA_DIR), {'vectors': vectors})
    return vectors
Example No. 4
class ElmoNer(nn.Module):
    def __init__(self,
                 num_units,
                 rnn_hidden,
                 num_tags,
                 num_layers=1,
                 use_cuda=False):
        super(ElmoNer, self).__init__()
        self.use_cuda = use_cuda
        self.embedding = Embedder(ELMO_PRETAIN_PATH)
        self.rnn = nn.GRU(num_units,
                          rnn_hidden,
                          num_layers=num_layers,
                          batch_first=True,
                          bidirectional=True)
        self.linear = nn.Linear(2 * rnn_hidden, num_tags)
        # self.linear = nn.Linear(num_units, num_tags)
        self.crf = CRF(num_tags)

    def forward(self, x_data, y_data, masks):
        """
		前向算法
		:param x_data:
		:param y_data:
		:param masks:
		:return:
		"""
        encoded_layers = self.embedding.sents2elmo(x_data)
        out = self.rnn_layer(encoded_layers)
        loss = -1 * self.crf(out, y_data.transpose(0, 1), masks.transpose(
            0, 1))
        return loss

    def rnn_layer(self, encoded_layers):
        """
		batch seq_len hidden
		:param encoded_layers:
		:return: batch seq_len class
		"""
        encoded_layers = np.array(encoded_layers)
        encoded_layers = torch.from_numpy(encoded_layers)
        if self.use_cuda:
            encoded_layers = encoded_layers.cuda()
        out, _ = self.rnn(encoded_layers)
        out = self.linear(out)
        out = out.transpose(0, 1)
        return out

    def test(self, x_data, masks):
        encoded_layers = self.embedding.sents2elmo(x_data)

        out = self.rnn_layer(encoded_layers)
        best_paths = self.crf.decode(out, mask=masks.transpose(0, 1))
        return best_paths
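A hedged usage sketch for ElmoNer; the constructor arguments and tensor shapes below are assumptions read off the code (ELMO_PRETAIN_PATH must point at a downloaded ELMoForManyLangs model, and CRF is assumed to follow the pytorch-crf (seq_len, batch) convention, as the transposes suggest).

import torch

model = ElmoNer(num_units=1024, rnn_hidden=256, num_tags=5)   # 1024 = ELMo embedding size

x_batch = [['今', '天', '好'], ['你', '好', '吗']]             # tokenized sentences, equal lengths
y_batch = torch.zeros(2, 3, dtype=torch.long)                  # gold tag ids, (batch, seq_len)
masks = torch.ones(2, 3, dtype=torch.bool)                     # True = real token, (batch, seq_len)

loss = model(x_batch, y_batch, masks)   # negative CRF log-likelihood
loss.backward()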
Example No. 5
class embed(object):
    def __init__(self, source, path):
        self.source = source
        self.path = path
        self._load_model()
    def _aravec(self, path):
        self.model = gensim.models.Word2Vec.load(path)
        self.vector_size = self.model.vector_size
    def _fasttext(self, path):
        self.model = fasttext.load_model(path)
        self.vector_size = self.model.get_dimension()
    def _elmo(self, path):
        self.model = Embedder(path, 64)
        
    def _load_model(self):
        if self.source == "aravec":
            self._aravec(self.path)
        elif self.source == "fasttext":
            self._fasttext(self.path)
        elif self.source == "elmo":
            self._elmo(self.path)
        else:
            raise ValueError("Model not supported. Please select either aravec, fasttext or elmo")
   
    def _embed_single(self, text, max_len):
        if self.source == "aravec":
            embedding = [self.model.wv[i].reshape(-1, self.vector_size) for i in text.split() if i in self.model.wv]
            if len(embedding) == 0:
                return self._pad(np.zeros((1, self.vector_size)), max_len)
            embedding = np.concatenate(embedding, axis=0)
            return self._pad(embedding, max_len)
        if self.source == "fasttext":
            embedding = [self.model.get_word_vector(i).reshape(-1, self.vector_size) for i in text.split()]
            embedding = np.concatenate(embedding, axis=0)
            return self._pad(embedding, max_len)
                    
    def embed_batch(self, text_list, max_len):
        if self.source == "elmo":
            input_segmented = [i.split() for i in text_list]
            embedding = self.model.sents2elmo(input_segmented)
            embedding = [self._pad(i, max_len) for i in embedding]            
            return np.concatenate(embedding, axis=0)
        else:    
            batch = [self._embed_single(i, max_len) for i in text_list]
            return np.concatenate(batch)
    
    def _pad(self, array, max_len):
        if array.shape[0] >= max_len:
            return np.expand_dims(array[:max_len],0)
        else:
            padding_size = max_len - array.shape[0]
            return np.expand_dims(np.pad(array, [(0, padding_size), (0, 0)], mode='constant', constant_values=0), 0)
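A hedged usage sketch for the ELMo branch of the class above; the model directory and the input strings are placeholders.

embedder = embed(source='elmo', path='/path/to/arabic_elmo_model')
batch = embedder.embed_batch(['مرحبا بالعالم', 'كيف حالك اليوم'], max_len=10)
print(batch.shape)   # (2, 10, 1024): one zero-padded (max_len, 1024) matrix per input string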
Example No. 6
    def __init__(self, data_path, max_seq_len=None, lang='en'):
        df = pd.read_csv(data_path, sep='\t')
        df = df.set_index('id')
        
        tokenizer = TweetTokenizer()
        
        self.labels = df[['HS', 'TR', 'AG']].values
        self.text = [tokenizer.tokenize(text) for text in df['text']]
        self.max_seq_len = max_seq_len

        if lang == 'en':
            self.elmo = Embedder('elmo/english/')
        elif lang == 'es':
            self.elmo = Embedder('elmo/spanish/')
Example No. 7
class ELMOTagger(BaseModel):
    def __init__(self, config, dataset):
        super().__init__(config)
        self.dataset = dataset
        self.output_size = len(dataset.vocabs.pos)
        self.embedder = Embedder(self.config.elmo_model,
                                 batch_size=self.config.batch_size)
        self.elmo_layer = self.config.elmo_layer
        if hasattr(self.config, 'lstm_size'):
            self.lstm = nn.LSTM(1024,
                                self.config.lstm_size,
                                batch_first=True,
                                dropout=self.config.dropout,
                                num_layers=self.config.lstm_num_layers,
                                bidirectional=True)
            hidden_size = self.config.lstm_size * 2
        else:
            self.lstm = None
            hidden_size = 1024
        if self.elmo_layer == 'weighted_sum':
            self.elmo_weights = nn.Parameter(torch.ones(3, dtype=torch.float))
        self.output_proj = nn.Linear(hidden_size, self.output_size)
        # ignore <pad> = 3
        self.criterion = nn.CrossEntropyLoss(
            ignore_index=self.dataset.vocabs.pos['<pad>'])

    def compute_loss(self, batch, output):
        target = to_cuda(torch.LongTensor(batch.pos))
        return compute_sequence_loss(target, output, self.criterion)

    def forward(self, batch):
        batch_size = len(batch[0])
        if self.elmo_layer == 'mean':
            embedded = self.embedder.sents2elmo(batch.sentence, -1)
            embedded = np.stack(embedded)
            embedded = to_cuda(torch.from_numpy(embedded))
        elif self.elmo_layer == 'weighted_sum':
            embedded = self.embedder.sents2elmo(batch.sentence, -2)
            embedded = np.stack(embedded)
            embedded = to_cuda(torch.from_numpy(embedded))
            embedded = (self.elmo_weights[None, :, None, None] *
                        embedded).sum(1)
        else:
            embedded = self.embedder.sents2elmo(batch.sentence,
                                                self.elmo_layer)
            embedded = np.stack(embedded)
            embedded = to_cuda(torch.from_numpy(embedded))
        if self.lstm:
            embedded = self.lstm(embedded)[0]
        return self.output_proj(embedded)
Example No. 8
 def __init__(self, lang="fr"):
     self.name = "elmo"
     if lang == "fr":
         from elmoformanylangs import Embedder
         self.e = Embedder('/home/bmazoyer/Dev/ELMoForManyLangs/150', batch_size=32)
         self.vectors = None
     elif lang == "en":
         import tensorflow as tf
         import tensorflow_hub as hub
         self.embed = hub.Module("https://tfhub.dev/google/elmo/2")
         self.session = tf.Session()
         self.session.run(tf.global_variables_initializer())
         self.session.run(tf.tables_initializer())
     self.lang = lang
Example No. 9
    def __init__(self, max_p_num, max_p_len, max_q_len,
                 train_files=[], dev_files=[], test_files=[], vocab=Vocab(lower=True)):
        self.logger = logging.getLogger("brc")
        self.max_p_num = max_p_num
        self.max_p_len = max_p_len
        self.max_q_len = max_q_len
        # **************************
        self.do = 1
        # whether to use BERT hidden layers as the word-vector input
        self.do_bert = False
        self.do_elmo = True
        if self.do_bert:
            self.bc = BertClient()
            self.vocab = vocab
            # if train_files:
            #     self._load_batch_size_data_set('train', train_files[0], train=True)
            # if dev_files:
            #     self._load_batch_size_data_set('dev', train_files[0])
            # if test_files:
            #     self._load_batch_size_data_set('test', train_files[0])
        if self.do_elmo:
            self.bc = Embedder('/tmp/ELMoForManyLangs/zhs.model')
            self.vocab = vocab
            # if train_files:
            #     self._load_batch_size_data_set('train', train_files[0], train=True)
            # if dev_files:
            #     self._load_batch_size_data_set('dev', train_files[0])
            # if test_files:
            #     self._load_batch_size_data_set('test', train_files[0])
        # **************************
        self.train_set, self.dev_set, self.test_set = [], [], []
        # lists can be concatenated with +=
        if not self.do_bert and not self.do_elmo:
            print('****************************')
            if train_files:
                # load the training set contents into self.train_set
                for train_file in train_files:
                    self.train_set += self._load_dataset(train_file, train=True)
                self.logger.info('Train set size: {} questions.'.format(len(self.train_set)))

            if dev_files:
                for dev_file in dev_files:
                    self.dev_set += self._load_dataset(dev_file)
                self.logger.info('Dev set size: {} questions.'.format(len(self.dev_set)))

            if test_files:
                for test_file in test_files:
                    self.test_set += self._load_dataset(test_file)
                self.logger.info('Test set size: {} questions.'.format(len(self.test_set)))
Example No. 10
def test():
    import numpy as np
    e = Embedder(ELMO_PRETAIN_PATH)
    text = [['今', '天', '天', '气', '真', '好', '阿'],
            ['你', '吃', '饭', '了', '吗', '?', 'd']]
    a = e.sents2elmo(text)
    a = np.array(a)
    print(a.shape)
    print(a)

    new_text = [chs_to_cht(line) for line in text]

    b = e.sents2elmo(new_text)
    print(b[0].shape, b[1].shape)
    print(b)
def elmo_data(samples):

    from elmoformanylangs import Embedder
    elmo = Embedder('data/elmo_model', batch_size=1)

    data = SimpleNamespace()
    data.x = elmo.sents2elmo([sample.text for sample in samples])
    data.x = np.array([
        np.mean(datum, axis=0)
        for datum
        in data.x
    ])
    data.y = np.array([sample.label for sample in samples])

    return data
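A hedged usage sketch for elmo_data; the samples and the 'data/elmo_model' directory are placeholders, and sample.text is assumed to already be a token list.

from types import SimpleNamespace

samples = [SimpleNamespace(text=['good', 'movie'], label=1),
           SimpleNamespace(text=['boring', 'plot'], label=0)]
data = elmo_data(samples)
print(data.x.shape, data.y.shape)   # (2, 1024) mean-pooled sentence vectors, (2,) labels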
Example No. 12
def test_elmoformanylangs():

    e = Embedder('/Users/feili/project/ELMoForManyLangs/output/en')
    # e = Embedder('/Users/feili/resource/data_to_train_emb/elmo_ade_lower_0norm_200d')

    sents = [['LABA', ',', 'such', 'as', 'vilanterol']]
    # for idx, sent in enumerate(sents):
    #     for idy, tk in enumerate(sent):
    #         sents[idx][idy] = sents[idx][idy].lower()

    ret = e.sents2elmo(sents)
    # will return a list of numpy arrays
    # each with the shape=(seq_len, embedding_size)

    pass
Example No. 13
def BuildDataSet(subset='train', output = -1):
  dataset_list = [list(read_corpus(fetch_20newsgroups(subset=subset,
                                          remove=('headers', 'footers', 'quotes'),
                                    categories=[cat])['data'],
                                    tokens_only=True,  bFastText=False, bRemoveStopWords=True))\
                        for cat in my_cats]
  corpus = [doc for category_list in dataset_list for doc in category_list]

  categories_lengths = [len(cat_list) for cat_list in dataset_list]
  categories = [[k for _ in range(0,length)] for k,length in enumerate(categories_lengths)]
  cats = [cat for elem_list in categories for cat in elem_list]  
  y = np.array(cats)
  np.savetxt('ELmo_20news_group_rep\\y_{}.csv'.format(subset), y, delimiter=",")
  
  print ('raw corpus ELMO Rep {} size {}'.format(subset, len(corpus)))
  e = Embedder('..\\PreTrainedElmo_EN', batch_size=64)
  
  try:
    os.remove('ELmo_20news_group_rep\\X_3L_{}1.csv'.format(subset))
  except:
    pass
  t0 = time()
  np_em= np.vstack([np.mean(np.array(e.sents2elmo([corpus[i]], output_layer=-2)[0]), axis=1).reshape(-1) 
      for i in range(0,len(corpus)//3)])
  np.savetxt('ELmo_20news_group_rep\\X_3L_{}1.csv'.format(subset), np_em, delimiter=',')
  print (f'finished generating 1st chunk of elmo reps in {time() - t0} seconds')


  try:
    os.remove('ELmo_20news_group_rep\\X_3L_{}2.csv'.format(subset))
  except:
    pass
  t0 = time()
  np_em= np.vstack([np.mean(np.array(e.sents2elmo([corpus[i]], output_layer=-2)[0]), axis=1).reshape(-1) 
      for i in range(len(corpus)//3, 2*(len(corpus)//3))])
  np.savetxt('ELmo_20news_group_rep\\X_3L_{}2.csv'.format(subset), np_em, delimiter=',')
  print (f'finished generating 2nd chunk of elmo reps in {time() - t0} seconds')


  try:
    os.remove('ELmo_20news_group_rep\\X_3L_{}3.csv'.format(subset))
  except:
    pass
  t0 = time()
  np_em= np.vstack([np.mean(np.array(e.sents2elmo([corpus[i]], output_layer=-2)[0]), axis=1).reshape(-1) 
      for i in range(2*(len(corpus)//3),len(corpus))])
  np.savetxt('ELmo_20news_group_rep\\X_3L_{}3.csv'.format(subset), np_em, delimiter=',')
  print (f'finished generating 3rd chunk of elmo reps in {time() - t0} seconds')
Example No. 14
class PretrainEmbedder(ElmoEmbedding):
    def __init__(self, model_dir, batch_size=64):
        super(PretrainEmbedder, self).__init__()
        self.embedder = Embedder(model_dir, batch_size)

    def predict(self, sentences, layer_index=-1):
        return self.embedder.sents2elmo(sentences, layer_index)
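A hedged usage sketch; the model directory is a placeholder and ElmoEmbedding is assumed to be a simple base class with a no-argument constructor.

embedder = PretrainEmbedder('/path/to/elmo_model', batch_size=16)
vectors = embedder.predict([['hello', 'world'], ['another', 'short', 'sentence']])
# vectors is a list with one numpy array per sentence, each of shape (seq_len, 1024)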
Example No. 15
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        if self.args.feat in ('char', 'bert', 'elmo'):
            self.WORD, self.FEAT = self.transform.FORM
        else:
            self.WORD, self.FEAT = self.transform.FORM, self.transform.CPOS
        self.ARC, self.REL = self.transform.HEAD, self.transform.DEPREL
        self.puncts = torch.tensor([i
                                    for s, i in self.WORD.vocab.stoi.items()
                                    if ispunct(s)]).to(self.args.device)
        if self.args.elmo_options:
            self.elmo = ElmoEmbedder(self.args.elmo_options, self.args.elmo_weights, -1)
        else:
            self.efml = EFML(self.args.elmo_weights)
            self.elmo = False
        #print(self.__dict__)
        if self.args.map_method == 'vecmap':
            self.mapper = Vecmap(vars(self.args))
        elif self.args.map_method == 'elmogan':
            self.mapper = Elmogan(vars(self.args))
        elif self.args.map_method == 'muse':
            self.mapper = Muse(vars(self.args))
        else:
            self.mapper = None
Example No. 16
class Elmo:

    def __init__(self, lang="fr"):
        self.name = "elmo"
        if lang == "fr":
            from elmoformanylangs import Embedder
            self.e = Embedder('/home/bmazoyer/Dev/ELMoForManyLangs/150', batch_size=32)
            self.vectors = None
        elif lang == "en":
            import tensorflow as tf
            import tensorflow_hub as hub
            self.embed = hub.Module("https://tfhub.dev/google/elmo/2")
            self.session = tf.Session()
            self.session.run(tf.global_variables_initializer())
            self.session.run(tf.tables_initializer())
        self.lang = lang

    def populate_array(self, data):
        logging.info(data.name)
        self.vectors[data.name] = np.mean(np.array(self.e.sents2elmo([data.text.split()]))[0], axis=0)

    def compute_vectors(self, data):
        n = data.shape[0]
        self.vectors = np.zeros((n, 1024))
        if self.lang == "fr":
            data.apply(self.populate_array, axis=1)
            return self.vectors
        elif self.lang == "en":
            batch_size = 64
            for i in tqdm(range(0, n, batch_size)):
                self.vectors[i:min(n, i + batch_size)] = self.session.run(
                    self.embed(data.text[i:min(n, i + batch_size)].tolist(), signature="default", as_dict=True)[
                        "default"]
                )
            return self.vectors
Example No. 17
class WordEmbeddings():
    """
        ELMo
        https://allennlp.org/elmo

    """
    def __init__(self,
                 model_path=r'../auxiliary_data/zhs.model/',
                 cuda_device=0):
        self.cuda_device = cuda_device
        self.elmo = Embedder(model_path)

    def get_tokenized_words_embeddings(self, sents_tokened):
        """
        @see EmbeddingDistributor
        :param tokenized_sents: list of tokenized words string (sentences/phrases)
        :return: ndarray with shape (len(sents), dimension of embeddings)
        """
        max_len = max([len(sent) for sent in sents_tokened])
        elmo_embedding = self.elmo.sents2elmo(sents_tokened, output_layer=-2)
        elmo_embedding = [
            np.pad(emb,
                   pad_width=((0, 0), (0, max_len - emb.shape[1]), (0, 0)),
                   mode='constant') for emb in elmo_embedding
        ]
        elmo_embedding = torch.from_numpy(np.array(elmo_embedding))
        return elmo_embedding


# if __name__ == '__main__':
#     sents = [['今', '天', '天气', '真', '好', '啊'],
#              ['潮水', '退', '了', '就', '知道', '谁', '没', '穿', '裤子']]
#     elmo = WordEmbeddings()
#     embs = elmo.get_tokenized_words_embeddings(sents)
#     print("OK")
def loadElmo(embfile):
    """

    :param embfile:
    :return:
    """
    model = Embedder(embfile)
    return model
Example No. 19
 def __init__(self,
              num_units,
              rnn_hidden,
              num_tags,
              num_layers=1,
              use_cuda=False):
     super(ElmoNer, self).__init__()
     self.use_cuda = use_cuda
     self.embedding = Embedder(ELMO_PRETAIN_PATH)
     self.rnn = nn.GRU(num_units,
                       rnn_hidden,
                       num_layers=num_layers,
                       batch_first=True,
                       bidirectional=True)
     self.linear = nn.Linear(2 * rnn_hidden, num_tags)
     # self.linear = nn.Linear(num_units, num_tags)
     self.crf = CRF(num_tags)
    def __init__(self,
                 emb_dim,
                 h_dim,
                 n_labels,
                 v_size,
                 gpu=True,
                 v_vec=None,
                 batch_first=True,
                 emb_type=None,
                 elmo_model_dir=None):
        super(BiLSTM, self).__init__()
        self.gpu = gpu
        self.h_dim = h_dim
        if self.h_dim is None:
            self.h_dim = emb_dim + 36
        if emb_type == 'ELMo':
            options_file = f'{elmo_model_dir}/options.json'
            weight_file = f'{elmo_model_dir}/weights.hdf5'
            self.word_embed = Elmo(options_file,
                                   weight_file,
                                   num_output_representations=1,
                                   dropout=0)
            if gpu:
                self.word_embed = self.word_embed.cuda()
        elif emb_type == 'ELMoForManyLangs':
            from elmoformanylangs import Embedder
            e = Embedder(elmo_model_dir)
            self.word_embed = e.sents2elmo
        elif emb_type == 'None':
            self.word_embed = None
        else:
            self.word_embed = nn.Embedding(v_size, emb_dim, padding_idx=0)
        if v_vec is not None:
            v_vec = torch.tensor(v_vec)
            self.word_embed.weight.data.copy_(v_vec)

        feature_embed_layers = []
        feature_embed_size = {
            "feature:0": 25,
            "feature:1": 26,
            "feature:2": 12,
            "feature:3": 6,
            "feature:4": 94,
            "feature:5": 32
        }
        for key in feature_embed_size:
            size = feature_embed_size[key]
            feature_embed = nn.Embedding(size, 5, padding_idx=0)
            feature_embed.weight.data[0] = torch.zeros(5)
            feature_embed_layers.append(feature_embed)
        self.feature_embed_layers = nn.ModuleList(feature_embed_layers)
        self.drop_target = nn.Dropout(p=0.2)
        self.lstm = nn.LSTM(input_size=emb_dim + 36,
                            hidden_size=self.h_dim,
                            batch_first=batch_first,
                            bidirectional=True)
        self.l1 = nn.Linear(self.h_dim * 2, n_labels)
Example No. 21
class ElmoModel(BaseModel):
    def __init__(self, path_to_model):
        self._embedder = Embedder(path_to_model)

    def process(self, sentences):
        return [
            np.mean(embeds, axis=0)
            for embeds in self._embedder.sents2elmo(sentences)
        ]
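A hedged usage sketch for ElmoModel.process; the model path is a placeholder, the input must already be tokenized, and BaseModel is assumed to need no extra setup.

model = ElmoModel('/path/to/elmo_model')
sentence_vectors = model.process([['a', 'small', 'test'], ['another', 'sentence']])
# one 1024-dim numpy vector per sentence: token embeddings averaged over the sentence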
def trans_elmo(dic):
    e = Embedder('pretrained/')
    sents = []
    seg = Wordseg(batch_size=8,
                  device="cuda:0",
                  embedding='elmo',
                  elmo_use_cuda=False,
                  mode="TW")
    for key in dic:
        sents.append(dic[key])
    sents = seg.cut(sents)
    vector = e.sents2elmo(sents, -1)
    print(len(vector))
    i = 0
    for key in dic:
        dic[key] = vector[i][0]
        i += 1
    return dic
Example No. 23
class Foreign_Elmo():
    def __init__(self, dir_name, embedding_source, device=None):
        self.embedding_source = embedding_source
        self.device = device
        print('loading the embedder')
        self.e = Embedder(dir_name)
        print('finish loading the embedder')

    def _get_embeddings(self, sent_lst):
        # if self.embedding_source  == 'elmo_0':
        #     type = self.e.sents2elmo(sent_lst, 0)
        #     type = [torch.tensor(x).unsqueeze(1) for x in type]
        #     return type, type
        # else:
        full_type = []
        full_token = []
        for batch_ in sent_lst:
            # print(len(batch_), len(batch_[0]))
            result = torch.tensor(self.e.sents2elmo(batch_, -2))
            # print(result.shape)
            type_ = result[:, 0, :, :]
            if self.embedding_source == 'elmo_1':
                index = 1
            elif self.embedding_source == 'elmo_2':
                index = 2
            elif self.embedding_source == 'elmo_0':
                index = 0
            token_ = result[:, index, :, :]
            full_type.append(type_)
            full_token.append(token_)
        return full_token, full_type

    def get_part_elmo(self, batch_):
        result = torch.tensor(self.e.sents2elmo(batch_, -2))
        # print(result.shape)
        if self.embedding_source == 'elmo_1':
            index = 1
        elif self.embedding_source == 'elmo_2':
            index = 2
        elif self.embedding_source == 'elmo_0':
            index = 0
        token_ = result[:, index, :, :]
        return token_
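A hedged usage sketch for get_part_elmo; the model directory is a placeholder, and all sentences in the batch are kept the same length so torch.tensor can stack the per-sentence arrays.

fe = Foreign_Elmo('/path/to/elmo_model', embedding_source='elmo_1')
batch = [['this', 'is', 'a', 'test'], ['just', 'four', 'more', 'tokens']]
token_reps = fe.get_part_elmo(batch)
print(token_reps.shape)   # (2, 4, 1024): layer-1 representations for each token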
Example No. 24
 def generate_batch_data(inputfile, batch_size, args):
     elmo = Embedder(args.weights)
     if args.mat0:
         W0 = {}
         W1 = {}
         W2 = {}
         mapmat = np.load(args.mat0)
         W0['src'] = mapmat['wx2']
         W0['trg'] = mapmat['wz2']
         W0['s'] = mapmat['s']
         mapmat = np.load(args.mat1)
         W1['src'] = mapmat['wx2']
         W1['trg'] = mapmat['wz2']
         W1['s'] = mapmat['s']
         mapmat = np.load(args.mat2)
         W2['src'] = mapmat['wx2']
         W2['trg'] = mapmat['wz2']
         W2['s'] = mapmat['s']
         mapmat = None
         xlingual = [W0, W1, W2]
     else:
         xlingual = [
             False,
         ] * 3
     while True:  # it needs to be infinitely iterable
         x, y = load_data(inputfile)
         print("INPUT SIZES X AND Y", len(x), len(y))
         assert len(x) == len(y)
         newxval = []
         yval = []
         for i in range(len(y)):
             newxval.append(x[i])
             yval.append(y[i])
             assert len(newxval) == len(yval)
             if i > 0 and i % batch_size == 0:
                 xval0, xval1, xval2 = embed_efml(newxval,
                                                  elmo,
                                                  xlingual,
                                                  lang=args.trlang)
                 ypadded = pad_labels(yval)
                 yield ([np.array(xval0),
                         np.array(xval1),
                         np.array(xval2)], np.array(ypadded))
                 newxval = []
                 yval = []
         if len(newxval) > 0:
             xval0, xval1, xval2 = embed_efml(newxval,
                                              elmo,
                                              xlingual,
                                              lang=args.trlang)
             ypadded = pad_labels(yval)
             yield ([np.array(xval0),
                     np.array(xval1),
                     np.array(xval2)], np.array(ypadded))
Example No. 25
    def __init__(self, **kwargs):
        super().__init__()

        self.elmo_embedding = Embedder(kwargs["elmo_model_path"],
                                       batch_size=kwargs["batch_size"])

        self.lstm = nn.LSTM(
            input_size=kwargs["elmo_output_dim"],
            hidden_size=kwargs["lstm_hidden_dim"],
            num_layers=kwargs["lstm_layers"],
            dropout=kwargs["dropout"] if kwargs["lstm_layers"] > 1 else 0,
            bidirectional=kwargs["bidirectional"])

        self.output = nn.Linear(in_features=2 * kwargs["lstm_hidden_dim"],
                                out_features=kwargs["output_size"])

        self.dropout = nn.Dropout(kwargs["dropout"])

        self.idx2word = kwargs["idx2word"]
        self.pad_idx = kwargs["pad_idx"]
Example No. 26
def elmo(language):
    if language not in languages:
        raise AttributeError("Required language not in list: {}".format(
            languages.keys()))
    r = request.json
    sentences = [sent.split(' ') for sent in r['sentences']]
    if languages[language] is None:
        languages[language] = Embedder(
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         paths[language]))
    result = languages[language].sents2elmo(sentences)
    return jsonify({"embeddings": [r.tolist() for r in result]})
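A hedged client-side sketch, assuming the handler above is registered on a local Flask app under a hypothetical /elmo/<language> route.

import requests

resp = requests.post('http://localhost:5000/elmo/en',
                     json={'sentences': ['hello world', 'elmo as a service']})
embeddings = resp.json()['embeddings']   # one (seq_len x 1024) nested list per sentence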
Example No. 27
def get_elmo_embedding(elmo_folder, token_file, sentence_vector_file):
    file_data = open(token_file, 'r', encoding="UTF-8")
    data_lines = file_data.readlines()
    batch_size = 32
    file_data.close()
    # Save one vector per sentence, one sentence per line; the original code summed the returned features: return q1_features + q2_features
    sentences = []
    for line in data_lines:
        sentences.append(line.strip().split(" "))
    print("句子数量: ", len(sentences))
    elmo = Embedder(elmo_folder)
    elmo_vectors = []
    for i in tqdm(range(int(len(sentences) / batch_size) + 1)):
        sentences_curr = sentences[i * batch_size:i * batch_size + batch_size]
        embedding = elmo.sents2elmo(sentences_curr, output_layer=-1)  # 1024-dimensional
        elmo_vectors += embedding

    assert len(sentences) == len(
        elmo_vectors), "len(data_lines) != len(elmo_vectors)"
    print("len(elmo_vectors): ", len(elmo_vectors))
    output_file = open(sentence_vector_file, 'wb')
    pk.dump(elmo_vectors, output_file)
    output_file.close()
Example No. 28
def classify():

    y = np.loadtxt('ELmo_20news_group_rep\\y_train_reduced.csv', delimiter=',')
    X = np.loadtxt('ELmo_20news_group_rep\\X_train_reduced.csv', delimiter=',')

    centroids = [['car', 'engine', 'drive', 'speed'],
                 ['religion', 'jesus', 'god', 'believe', 'heaven', 'sin'],
                 [
                     'baseball', 'player', 'run', 'sport', 'hit', 'bat',
                     'rotation'
                 ],
                 ['electronics', 'conductive', 'power', 'resistor', 'circuit'],
                 ['medical', 'methodology', 'science', 'molecule', 'virus']]

    e = Embedder('..\\PreTrainedElmo_EN', batch_size=64)
    em_vecs = [
        np.mean(e.sents2elmo([cat_taxo])[0], axis=0) for cat_taxo in centroids
    ]

    # X_train = np.loadtxt('custom_doc2vec_data\\X_train.csv', delimiter=',')
    # y_train = np.loadtxt('custom_doc2vec_data\\y_train.csv', delimiter=',')
    # dist = met.pairwise_distances(X= X_train,Y=list_centroids_vectors[0].reshape(1, -1),metric='cosine')
    dist = met.pairwise_distances(X=X, Y=np.vstack(em_vecs), metric='cosine')
    print(dist.shape)
    indexes = np.argmin(dist, axis=1)

    diff_list = (indexes - y).tolist()
    diff = [1 if d == 0 else 0 for d in diff_list]
    print('taxonomy-based semi supervised classification accuracy : {}'.format(
        sum(diff) / len(diff)))

    plt.plot(indexes)

    plt.figure()
    plt.plot(y)

    plt.show()
Example No. 29
 def __init__(self, config, dataset):
     super().__init__(config)
     self.dataset = dataset
     self.output_size = len(dataset.vocabs.pos)
     self.embedder = Embedder(self.config.elmo_model,
                              batch_size=self.config.batch_size)
     self.elmo_layer = self.config.elmo_layer
     if hasattr(self.config, 'lstm_size'):
         self.lstm = nn.LSTM(1024,
                             self.config.lstm_size,
                             batch_first=True,
                             dropout=self.config.dropout,
                             num_layers=self.config.lstm_num_layers,
                             bidirectional=True)
         hidden_size = self.config.lstm_size * 2
     else:
         self.lstm = None
         hidden_size = 1024
     if self.elmo_layer == 'weighted_sum':
         self.elmo_weights = nn.Parameter(torch.ones(3, dtype=torch.float))
     self.output_proj = nn.Linear(hidden_size, self.output_size)
     # ignore <pad> = 3
     self.criterion = nn.CrossEntropyLoss(
         ignore_index=self.dataset.vocabs.pos['<pad>'])
Example No. 30
    def test_with_real_embedder(self):
        model = ElmoModel(
            max_len=50,
            fasttext_model=None,
            elmo_embedder=Embedder("models/elmo/es/"),
            **self.model_args,
        )

        X = ["Esto no es agresivo", "Esto sí es agresivo"]
        y = np.array([0, 1]).reshape(-1, 1)

        model.compile(loss='binary_crossentropy', optimizer='adam',
                      metrics=['accuracy'])

        model.fit(X, y, epochs=2)