class ElmoNer(nn.Module):
    def __init__(self, num_units, rnn_hidden, num_tags, num_layers=1, use_cuda=False):
        super(ElmoNer, self).__init__()
        self.use_cuda = use_cuda
        self.embedding = Embedder(ELMO_PRETAIN_PATH)
        self.rnn = nn.GRU(num_units, rnn_hidden, num_layers=num_layers,
                          batch_first=True, bidirectional=True)
        self.linear = nn.Linear(2 * rnn_hidden, num_tags)
        # self.linear = nn.Linear(num_units, num_tags)
        self.crf = CRF(num_tags)

    def forward(self, x_data, y_data, masks):
        """Forward pass: ELMo embedding -> BiGRU -> CRF negative log-likelihood.

        :param x_data: batch of tokenized sentences
        :param y_data: gold tag ids, shape (batch, seq_len)
        :param masks: padding mask, shape (batch, seq_len)
        :return: scalar CRF loss
        """
        encoded_layers = self.embedding.sents2elmo(x_data)
        out = self.rnn_layer(encoded_layers)
        loss = -1 * self.crf(out, y_data.transpose(0, 1), masks.transpose(0, 1))
        return loss

    def rnn_layer(self, encoded_layers):
        """Run the BiGRU and the output projection.

        :param encoded_layers: list of (seq_len, hidden) ELMo arrays, one per sentence
        :return: emission scores of shape (seq_len, batch, num_tags), transposed for the CRF
        """
        encoded_layers = np.array(encoded_layers)
        encoded_layers = torch.from_numpy(encoded_layers)
        if self.use_cuda:
            encoded_layers = encoded_layers.cuda()
        out, _ = self.rnn(encoded_layers)
        out = self.linear(out)
        out = out.transpose(0, 1)
        return out

    def test(self, x_data, masks):
        encoded_layers = self.embedding.sents2elmo(x_data)
        out = self.rnn_layer(encoded_layers)
        best_paths = self.crf.decode(out, mask=masks.transpose(0, 1))
        return best_paths
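A minimal usage sketch for the tagger above, assuming ELMO_PRETAIN_PATH points at a downloaded ELMoForManyLangs model (1024-dim output, hence num_units=1024) and that CRF comes from pytorch-crf. Because rnn_layer stacks the raw per-sentence arrays, all sentences in a batch must share one length; the sentences, tag count, and tensors below are illustrative only.

# Hypothetical usage sketch; shapes and tag ids are made up.
model = ElmoNer(num_units=1024, rnn_hidden=128, num_tags=5)
x = [['今', '天', '真', '好'], ['你', '吃', '了', '吗']]    # equal-length sentences
y = torch.randint(0, 5, (2, 4))                            # (batch, seq_len) gold tags
m = torch.ones(2, 4, dtype=torch.uint8)                    # (batch, seq_len) padding mask
loss = model(x, y, m)
loss.backward()
paths = model.test(x, m)                                    # list of best tag sequences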
class ELMOTagger(BaseModel):
    def __init__(self, config, dataset):
        super().__init__(config)
        self.dataset = dataset
        self.output_size = len(dataset.vocabs.pos)
        self.embedder = Embedder(self.config.elmo_model,
                                 batch_size=self.config.batch_size)
        self.elmo_layer = self.config.elmo_layer
        if hasattr(self.config, 'lstm_size'):
            self.lstm = nn.LSTM(1024, self.config.lstm_size,
                                batch_first=True,
                                dropout=self.config.dropout,
                                num_layers=self.config.lstm_num_layers,
                                bidirectional=True)
            hidden_size = self.config.lstm_size * 2
        else:
            self.lstm = None
            hidden_size = 1024
        if self.elmo_layer == 'weighted_sum':
            self.elmo_weights = nn.Parameter(torch.ones(3, dtype=torch.float))
        self.output_proj = nn.Linear(hidden_size, self.output_size)
        # ignore <pad> = 3
        self.criterion = nn.CrossEntropyLoss(
            ignore_index=self.dataset.vocabs.pos['<pad>'])

    def compute_loss(self, batch, output):
        target = to_cuda(torch.LongTensor(batch.pos))
        return compute_sequence_loss(target, output, self.criterion)

    def forward(self, batch):
        batch_size = len(batch[0])
        if self.elmo_layer == 'mean':
            embedded = self.embedder.sents2elmo(batch.sentence, -1)
            embedded = np.stack(embedded)
            embedded = to_cuda(torch.from_numpy(embedded))
        elif self.elmo_layer == 'weighted_sum':
            embedded = self.embedder.sents2elmo(batch.sentence, -2)
            embedded = np.stack(embedded)
            embedded = to_cuda(torch.from_numpy(embedded))
            embedded = (self.elmo_weights[None, :, None, None] * embedded).sum(1)
        else:
            embedded = self.embedder.sents2elmo(batch.sentence, self.elmo_layer)
            embedded = np.stack(embedded)
            embedded = to_cuda(torch.from_numpy(embedded))
        if self.lstm:
            embedded = self.lstm(embedded)[0]
        return self.output_proj(embedded)
def test():
    import numpy as np
    e = Embedder(ELMO_PRETAIN_PATH)
    text = [['今', '天', '天', '气', '真', '好', '阿'],
            ['你', '吃', '饭', '了', '吗', '?', 'd']]
    a = e.sents2elmo(text)
    a = np.array(a)
    print(a.shape)
    print(a)
    new_text = [chs_to_cht(line) for line in text]
    b = e.sents2elmo(new_text)
    print(b[0].shape, b[1].shape)
    print(b)
def BuildDataSet(subset='train', output=-1):
    dataset_list = [list(read_corpus(fetch_20newsgroups(subset=subset,
                                                        remove=('headers', 'footers', 'quotes'),
                                                        categories=[cat])['data'],
                                     tokens_only=True, bFastText=False,
                                     bRemoveStopWords=True))
                    for cat in my_cats]
    corpus = [doc for category_list in dataset_list for doc in category_list]
    categories_lengths = [len(cat_list) for cat_list in dataset_list]
    categories = [[k for _ in range(0, length)]
                  for k, length in enumerate(categories_lengths)]
    cats = [cat for elem_list in categories for cat in elem_list]
    y = np.array(cats)
    np.savetxt('ELmo_20news_group_rep\\y_{}.csv'.format(subset), y, delimiter=",")
    print('raw corpus ELMO Rep {} size {}'.format(subset, len(corpus)))
    e = Embedder('..\\PreTrainedElmo_EN', batch_size=64)

    try:
        os.remove('ELmo_20news_group_rep\\X_3L_{}1.csv'.format(subset))
    except OSError:
        pass
    t0 = time()
    np_em = np.vstack([np.mean(np.array(e.sents2elmo([corpus[i]], output_layer=-2)[0]),
                               axis=1).reshape(-1)
                       for i in range(0, len(corpus) // 3)])
    np.savetxt('ELmo_20news_group_rep\\X_3L_{}1.csv'.format(subset), np_em, delimiter=',')
    print(f'finished generating 1st chunk of elmo reps in {time() - t0} seconds')

    try:
        os.remove('ELmo_20news_group_rep\\X_3L_{}2.csv'.format(subset))
    except OSError:
        pass
    t0 = time()
    np_em = np.vstack([np.mean(np.array(e.sents2elmo([corpus[i]], output_layer=-2)[0]),
                               axis=1).reshape(-1)
                       for i in range(len(corpus) // 3, 2 * (len(corpus) // 3))])
    np.savetxt('ELmo_20news_group_rep\\X_3L_{}2.csv'.format(subset), np_em, delimiter=',')
    print(f'finished generating 2nd chunk of elmo reps in {time() - t0} seconds')

    try:
        os.remove('ELmo_20news_group_rep\\X_3L_{}3.csv'.format(subset))
    except OSError:
        pass
    t0 = time()
    np_em = np.vstack([np.mean(np.array(e.sents2elmo([corpus[i]], output_layer=-2)[0]),
                               axis=1).reshape(-1)
                       for i in range(2 * (len(corpus) // 3), len(corpus))])
    np.savetxt('ELmo_20news_group_rep\\X_3L_{}3.csv'.format(subset), np_em, delimiter=',')
    print(f'finished generating 3rd chunk of elmo reps in {time() - t0} seconds')
def trainELMoToFile(self, elmoModelPath, filename, words, vocab_size):
    """Build ELMo word embeddings and write them to a text file.

    elmoModelPath: absolute path of the model to load*
    filename: file to save the word embeddings to
    words: word list (vocabulary)
    vocab_size: length of the word list (vocabulary)

    * visit https://github.com/HIT-SCIR/ELMoForManyLangs#downloads to download
      the Spanish model and check the setup.
    """
    from elmoformanylangs import Embedder
    import numpy as np

    e = Embedder(elmoModelPath)  # path of the downloaded model (e.g. /home/user/project/ELMo.es/)
    embedding_matrix = np.zeros((vocab_size, 1024))
    for i, word in enumerate(words):
        aux_elmo = e.sents2elmo([[word]])
        with open(filename, 'a') as g:
            strnums = [str(num) for num in aux_elmo[0][0].tolist()]
            strnums = ' '.join(strnums)
            g.write("{} {}\n".format(word, strnums))
        # print("Processing \t{} of {}...".format(i + 1, len(words)))
class Elmo:
    def __init__(self, lang="fr"):
        self.name = "elmo"
        if lang == "fr":
            from elmoformanylangs import Embedder
            self.e = Embedder('/home/bmazoyer/Dev/ELMoForManyLangs/150', batch_size=32)
            self.vectors = None
        elif lang == "en":
            import tensorflow as tf
            import tensorflow_hub as hub
            self.embed = hub.Module("https://tfhub.dev/google/elmo/2")
            self.session = tf.Session()
            self.session.run(tf.global_variables_initializer())
            self.session.run(tf.tables_initializer())
        self.lang = lang

    def populate_array(self, data):
        logging.info(data.name)
        self.vectors[data.name] = np.mean(
            np.array(self.e.sents2elmo([data.text.split()]))[0], axis=0)

    def compute_vectors(self, data):
        n = data.shape[0]
        self.vectors = np.zeros((n, 1024))
        if self.lang == "fr":
            data.apply(self.populate_array, axis=1)
            return self.vectors
        elif self.lang == "en":
            batch_size = 64
            for i in tqdm(range(0, n, batch_size)):
                self.vectors[i:min(n, i + batch_size)] = self.session.run(
                    self.embed(data.text[i:min(n, i + batch_size)].tolist(),
                               signature="default", as_dict=True)["default"])
            return self.vectors
class PretrainEmbedder(ElmoEmbedding):
    def __init__(self, model_dir, batch_size=64):
        super(PretrainEmbedder, self).__init__()
        self.embedder = Embedder(model_dir, batch_size)

    def predict(self, sentences, layer_index=-1):
        return self.embedder.sents2elmo(sentences, layer_index)
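A small, hedged call of the wrapper above; it assumes the ElmoEmbedding base class takes no constructor arguments and uses a placeholder model directory for any downloaded ELMoForManyLangs model.

# Hypothetical usage; 'zhs.model' stands in for a local pretrained model directory.
embedder = PretrainEmbedder('zhs.model', batch_size=32)
vecs = embedder.predict([['今', '天', '天气', '真', '好']])   # list of (seq_len, 1024) arrays
print(vecs[0].shape)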
def generate_data_set(test_cats=my_cats, subset='train'):
    dataset_list = [list(read_corpus(fetch_20newsgroups(subset=subset,
                                                        remove=('headers', 'footers', 'quotes'),
                                                        categories=[cat])['data'],
                                     tokens_only=True, bFastText=False,
                                     bRemoveStopWords=True))
                    for cat in test_cats]
    corpus = [doc for category_list in dataset_list for doc in category_list]
    categories_lengths = [len(cat_list) for cat_list in dataset_list]
    categories = [[k for _ in range(0, length)]
                  for k, length in enumerate(categories_lengths)]
    cats = [cat for elem_list in categories for cat in elem_list]
    y = np.array(cats)
    np.savetxt('ELmo_20news_group_rep\\y_train_reduced.csv',
               y[range(0, len(cats), 50)], delimiter=",")
    print('raw corpus ELMO Rep {} size {}'.format(subset, len(corpus)))
    e = Embedder('..\\PreTrainedElmo_EN', batch_size=64)
    try:
        os.remove('ELmo_20news_group_rep\\X_train_reduced.csv')
    except OSError:
        pass
    with open('ELmo_20news_group_rep\\X_train_reduced.csv', mode='a') as myFile:
        # for i in range(0, 2):
        for i in range(0, len(corpus), 50):
            em = np.mean(e.sents2elmo([corpus[i]])[0], axis=0)
            # print(em.shape)
            myFile.write('{}'.format(em.tolist()).strip('[').strip(']').replace(' ', ''))
            myFile.write('\n')
class WordEmbeddings():
    """
    ELMo
    https://allennlp.org/elmo
    """

    def __init__(self, model_path=r'../auxiliary_data/zhs.model/', cuda_device=0):
        self.cuda_device = cuda_device
        self.elmo = Embedder(model_path)

    def get_tokenized_words_embeddings(self, sents_tokened):
        """
        @see EmbeddingDistributor
        :param sents_tokened: list of tokenized sentences/phrases (lists of word strings)
        :return: tensor with shape (len(sents), 3, max_len, embedding_dim)
        """
        max_len = max([len(sent) for sent in sents_tokened])
        elmo_embedding = self.elmo.sents2elmo(sents_tokened, output_layer=-2)
        elmo_embedding = [np.pad(emb,
                                 pad_width=((0, 0), (0, max_len - emb.shape[1]), (0, 0)),
                                 mode='constant')
                          for emb in elmo_embedding]
        elmo_embedding = torch.from_numpy(np.array(elmo_embedding))
        return elmo_embedding


# if __name__ == '__main__':
#     sents = [['今', '天', '天气', '真', '好', '啊'],
#              ['潮水', '退', '了', '就', '知道', '谁', '没', '穿', '裤子']]
#     elmo = WordEmbeddings()
#     embs = elmo.get_tokenized_words_embeddings(sents)
#     print("OK")
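For reference, a hedged call of the padding wrapper above: with output_layer=-2 each sentence yields a (3, seq_len, 1024) array, so after padding and stacking the result is (n_sents, 3, max_len, 1024). The model path is a placeholder.

elmo = WordEmbeddings(model_path='zhs.model')            # hypothetical local model path
sents = [['今', '天', '天气', '真', '好'], ['你', '好']]
embs = elmo.get_tokenized_words_embeddings(sents)
print(embs.shape)                                        # torch.Size([2, 3, 5, 1024])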
class ElmoModel(BaseModel):
    def __init__(self, path_to_model):
        self._embedder = Embedder(path_to_model)

    def process(self, sentences):
        return [np.mean(embeds, axis=0)
                for embeds in self._embedder.sents2elmo(sentences)]
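A quick sketch of how the mean-pooling model above might be used; the model path is a placeholder and BaseModel is assumed not to require constructor arguments.

model = ElmoModel('zhs.model')                            # hypothetical path to a pretrained model
sent_vecs = model.process([['今', '天', '天气', '真', '好']])
print(sent_vecs[0].shape)                                 # (1024,) mean over the token vectors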
def get_elmo_vector(vocab: list) -> np.ndarray:
    from elmoformanylangs import Embedder
    e = Embedder('{}/zhs.model'.format(DATA_DIR))
    vectors = np.zeros((len(vocab), 1024))
    vectors[2:] = e.sents2elmo(vocab[2:])
    scio.savemat('{}/elmo.mat'.format(DATA_DIR), {'vectors': vectors})
    return vectors
class Foreign_Elmo():
    def __init__(self, dir_name, embedding_source, device=None):
        self.embedding_source = embedding_source
        self.device = device
        print('loading the embedder')
        self.e = Embedder(dir_name)
        print('finish loading the embedder')

    def _get_embeddings(self, sent_lst):
        # if self.embedding_source == 'elmo_0':
        #     type = self.e.sents2elmo(sent_lst, 0)
        #     type = [torch.tensor(x).unsqueeze(1) for x in type]
        #     return type, type
        # else:
        full_type = []
        full_token = []
        for batch_ in sent_lst:
            # print(len(batch_), len(batch_[0]))
            result = torch.tensor(self.e.sents2elmo(batch_, -2))
            # print(result.shape)
            type_ = result[:, 0, :, :]
            if self.embedding_source == 'elmo_1':
                index = 1
            elif self.embedding_source == 'elmo_2':
                index = 2
            elif self.embedding_source == 'elmo_0':
                index = 0
            token_ = result[:, index, :, :]
            full_type.append(type_)
            full_token.append(token_)
        return full_token, full_type

    def get_part_elmo(self, batch_):
        result = torch.tensor(self.e.sents2elmo(batch_, -2))
        # print(result.shape)
        if self.embedding_source == 'elmo_1':
            index = 1
        elif self.embedding_source == 'elmo_2':
            index = 2
        elif self.embedding_source == 'elmo_0':
            index = 0
        token_ = result[:, index, :, :]
        return token_
class embed(object):
    def __init__(self, source, path):
        self.source = source
        self.path = path
        self._load_model()

    def _aravec(self, path):
        self.model = gensim.models.Word2Vec.load(path)
        self.vector_size = self.model.vector_size

    def _fasttext(self, path):
        self.model = fasttext.load_model(path)
        self.vector_size = self.model.get_dimension()

    def _elmo(self, path):
        self.model = Embedder(path, 64)

    def _load_model(self):
        if self.source == "aravec":
            self._aravec(self.path)
        elif self.source == "fasttext":
            self._fasttext(self.path)
        elif self.source == "elmo":
            self._elmo(self.path)
        else:
            raise ValueError("Model not supported. Please select either aravec, fasttext or elmo")

    def _embed_single(self, text, max_len):
        if self.source == "aravec":
            embedding = [self.model.wv[i].reshape(-1, self.vector_size)
                         for i in text.split() if i in self.model.wv]
            if len(embedding) == 0:
                return self._pad(np.zeros((1, self.vector_size)), max_len)
            embedding = np.concatenate(embedding, axis=0)
            return self._pad(embedding, max_len)
        if self.source == "fasttext":
            embedding = [self.model.get_word_vector(i).reshape(-1, self.vector_size)
                         for i in text.split()]
            embedding = np.concatenate(embedding, axis=0)
            return self._pad(embedding, max_len)

    def embed_batch(self, text_list, max_len):
        if self.source == "elmo":
            input_segmented = [i.split() for i in text_list]
            embedding = self.model.sents2elmo(input_segmented)
            embedding = [self._pad(i, max_len) for i in embedding]
            return np.concatenate(embedding, axis=0)
        else:
            batch = [self._embed_single(i, max_len) for i in text_list]
            return np.concatenate(batch)

    def _pad(self, array, max_len):
        if array.shape[0] >= max_len:
            return np.expand_dims(array[:max_len], 0)
        else:
            padding_size = max_len - array.shape[0]
            return np.expand_dims(np.pad(array, [(0, padding_size), (0, 0)],
                                         mode='constant', constant_values=0), 0)
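A hedged example of the ELMo branch of the wrapper above; the model path and the input text are placeholders, and max_len controls truncation or zero-padding of each whitespace-tokenized sentence.

# Hypothetical usage of the 'elmo' source; path and text are illustrative only.
emb = embed(source='elmo', path='elmo_arabic_model/')
batch = emb.embed_batch(['مثال نص قصير', 'جملة أخرى'], max_len=16)
print(batch.shape)                                        # (2, 16, 1024)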
def elmo_data(samples):
    from elmoformanylangs import Embedder
    elmo = Embedder('data/elmo_model', batch_size=1)
    data = SimpleNamespace()
    data.x = elmo.sents2elmo([sample.text for sample in samples])
    data.x = np.array([np.mean(datum, axis=0) for datum in data.x])
    data.y = np.array([sample.label for sample in samples])
    return data
def test_elmoformanylangs():
    e = Embedder('/Users/feili/project/ELMoForManyLangs/output/en')
    # e = Embedder('/Users/feili/resource/data_to_train_emb/elmo_ade_lower_0norm_200d')
    sents = [['LABA', ',', 'such', 'as', 'vilanterol']]
    # for idx, sent in enumerate(sents):
    #     for idy, tk in enumerate(sent):
    #         sents[idx][idy] = sents[idx][idy].lower()
    ret = e.sents2elmo(sents)  # will return a list of numpy arrays,
    # each with the shape=(seq_len, embedding_size)
    pass
def load_elmo(all_tokens, language=""):
    # Here, we will load ELMo.
    # For Chinese + Vietnamese, we use pretrained https://github.com/HIT-SCIR/ELMoForManyLangs
    all_tokens = list(set(all_tokens))
    current_word2idx = {all_tokens[i]: i + 4 for i in range(len(all_tokens))}
    current_word2idx["unk"] = 1
    current_word2idx["<SOS>"] = 2
    current_word2idx["<EOS>"] = 3
    weights = []
    elmo = ElmoEmbedder()
    if language == "zh":
        e = Embedder('179')
        weights = e.sents2elmo(all_tokens)
        weights = preprocess_weights(weights)
        # so this basically returns a 3 x 1024 for any words that are
        # in the token_embedding
        # and nan for any words not in the embedding
    elif language == "vi":
        pdb.set_trace()
        e = Embedder('178')
        weights = e.sents2elmo(all_tokens)
        weights = preprocess_weights(weights)
    else:
        weights = elmo.embed_sentence(all_tokens)
        weights = np.mean(weights, axis=0)  # take the average
    final_weights = []
    final_weights.append([0] * EMBED_DIM)
    unknown_vector = list(np.random.normal(size=(EMBED_DIM,)))
    start_vector = list(np.random.normal(size=(EMBED_DIM,)))
    end_vector = list(np.random.normal(size=(EMBED_DIM,)))
    final_weights.append(unknown_vector)
    final_weights.append(start_vector)
    final_weights.append(end_vector)
    final_weights.extend(weights)
    return current_word2idx, final_weights
class ElmoEncoder(BaseTextEncoder):
    is_trained = True
    batch_size = 64

    def __init__(self, model_dir: str,
                 pooling_layer: int = -1,
                 pooling_strategy: str = 'REDUCE_MEAN', *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.model_dir = model_dir
        if pooling_layer > 2:
            raise ValueError('pooling_layer = %d is not supported now!' % pooling_layer)
        self.pooling_layer = pooling_layer
        self.pooling_strategy = pooling_strategy

    def post_init(self):
        from elmoformanylangs import Embedder
        from ...helper import Tokenizer
        self._elmo = Embedder(model_dir=self.model_dir, batch_size=self.batch_size)
        self.cn_tokenizer = Tokenizer()

    @batching
    def encode(self, text: List[str], *args, **kwargs) -> np.ndarray:
        # tokenize text
        batch_tokens = [self.cn_tokenizer.tokenize(sent) for sent in text]
        elmo_encodes = self._elmo.sents2elmo(batch_tokens, output_layer=-2)
        pooled_data = []
        for token_encodes in elmo_encodes:
            if self.pooling_layer == -1:
                _layer_data = np.average(token_encodes, axis=0)
            elif self.pooling_layer >= 0:
                _layer_data = token_encodes[self.pooling_layer]
            else:
                raise ValueError('pooling_layer = %d is not supported now!' % self.pooling_layer)
            _pooled = pooling_np(_layer_data, self.pooling_strategy)
            pooled_data.append(_pooled)
        return np.array(pooled_data, dtype=np.float32)
class ELMo_Model():
    # follow the below repo to download the data
    # https://github.com/HIT-SCIR/ELMoForManyLangs
    def __init__(self):
        self.model = Embedder('elmo_model/')

    def similarity(self, word1: str, word2: str):
        if word1 is None or word2 is None:
            return None
        else:
            try:
                vec_word1 = self.model.sents2elmo([[word1]])[0]
                vec_word2 = self.model.sents2elmo([[word2]])[0]
                cos_similarity = cosine_similarity(vec_word1, vec_word2)[0][0]
            except KeyError:
                cos_similarity = 0
            return cos_similarity
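A brief usage sketch of the similarity helper above, assuming a pretrained model has already been placed under elmo_model/; the word pair is arbitrary.

m = ELMo_Model()                       # expects a downloaded model under elmo_model/
print(m.similarity('rain', 'snow'))    # cosine similarity of two single-word "sentences"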
class PhraseNeighbors:
    def __init__(self, model_dir):
        self.faiss_db = faiss.read_index(os.path.join(model_dir, 'faiss_db'))
        with open(os.path.join(model_dir, 'faiss_lookup.json')) as f:
            self.faiss_lookup = json.loads(f.read())
        self.elmo = Embedder(os.path.join(model_dir, 'elmo_nl'))
        self.tokenizer = Tokenizer()

    def query(self, s, topn):
        s = [w.value for w in self.tokenizer.tokenize(s)]
        X = self.elmo.sents2elmo([s])
        X = np.array([x.mean(axis=0) for x in X])
        distances, indices = self.faiss_db.search(X, k=topn)
        return [(self.faiss_lookup[i], d) for i, d in zip(indices[0], distances[0])]
class elmo_embedding_layer(nn.Module):
    def __init__(self, vocab):
        super(elmo_embedding_layer, self).__init__()
        self.embed = Embedder('./embed_module/elmo/chinese')
        self.vocab = vocab

    def forward(self, x):
        '''
        To keep the calling code consistent, the input here is a batch_sz x seq_len tensor.
        Steps:
        1. use the vocab to map the ids back to a list of sentences
        2. run the forward pass
        :param x: tensor [batch_sz x seq_len]
        :return: tensor [batch_sz x seq_len x dim]
        '''
        seq_batch = [[self.vocab[i] for i in seq] for seq in x]
        seq_embed = self.embed.sents2elmo(seq_batch)
        return torch.Tensor(seq_embed)
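A minimal sketch of the layer above, assuming vocab is an id-to-token list (0-dim index tensors work as list indices) and that all sequences in a batch share one length so the outputs can be stacked into a single tensor; the vocabulary and ids are made up, and the model path must already exist.

vocab = ['今', '天', '真', '好']                 # hypothetical id -> token lookup
layer = elmo_embedding_layer(vocab)
x = torch.tensor([[0, 1, 2, 3]])                 # (batch_sz, seq_len) of token ids
out = layer(x)
print(out.shape)                                 # torch.Size([1, 4, 1024])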
def trans_elmo(dic):
    e = Embedder('pretrained/')
    sents = []
    seg = Wordseg(batch_size=8, device="cuda:0", embedding='elmo',
                  elmo_use_cuda=False, mode="TW")
    for key in dic:
        sents.append(dic[key])
    sents = seg.cut(sents)
    vector = e.sents2elmo(sents, -1)
    print(len(vector))
    i = 0
    for key in dic:
        dic[key] = vector[i][0]
        i += 1
    return dic
class ElmoEmbeddings:
    def __init__(self, modelPath="/home/joan/Escritorio/ELMoForManyLangs-master/EN/"):
        self.elmo = Embedder(modelPath)

    def getSentenceVector(self, sentence):
        vecs = self.elmo.sents2elmo(sentence)
        vectorList = []
        for vec in vecs:
            vectorList.append(vec[0].tolist())
        return vectorList

    def getCentroid(self, vectors):
        avgVector = np.mean(vectors, axis=0)
        return avgVector.tolist()

    def distance(self, A, B, distance="cosine"):
        return cdist([A], [B], distance)
class WordEmbeddings():
    """
    ELMo
    https://allennlp.org/elmo
    """

    def __init__(self, model_path=r'../auxiliary_data/zhs.model/', cuda_device=0):
        self.cuda_device = cuda_device
        self.elmo = Embedder(model_path)

    def get_tokenized_words_embeddings(self, sents_tokened):
        """
        @see EmbeddingDistributor
        :param sents_tokened: list of tokenized sentences/phrases (lists of word strings)
        :return: list of ndarrays, one per sentence
        """
        elmo_embedding = self.elmo.sents2elmo(
            sents_tokened, output_layer=-2)  # list of arrays, one entry per sentence
        # elmo_embedding = torch.from_numpy(np.array(elmo_embedding))
        return elmo_embedding
def get_elmo_embedding(elmo_floder, token_file, sentence_vector_file):
    file_data = open(token_file, 'r', encoding="UTF-8")
    data_lines = file_data.readlines()
    batch_size = 32
    file_data.close()
    # Save one vector per sentence, one single sentence per line.
    # The original code summed the features: return q1_features + q2_features
    sentences = []
    for line in data_lines:
        sentences.append(line.strip().split(" "))
    print("number of sentences: ", len(sentences))
    elmo = Embedder(elmo_floder)
    elmo_vectors = []
    for i in tqdm(range(int(len(sentences) / batch_size) + 1)):
        sentences_curr = sentences[i * batch_size:i * batch_size + batch_size]
        embedding = elmo.sents2elmo(sentences_curr, output_layer=-1)  # 1024-dimensional
        elmo_vectors += embedding
    assert len(sentences) == len(elmo_vectors), "len(data_lines) != len(elmo_vectors)"
    print("len(elmo_vectors): ", len(elmo_vectors))
    output_file = open(sentence_vector_file, 'wb')  # binary mode takes no encoding argument
    pk.dump(elmo_vectors, output_file)
    output_file.close()
def classify():
    y = np.loadtxt('ELmo_20news_group_rep\\y_train_reduced.csv', delimiter=',')
    X = np.loadtxt('ELmo_20news_group_rep\\X_train_reduced.csv', delimiter=',')
    centroids = [['car', 'engine', 'drive', 'speed'],
                 ['religion', 'jesus', 'god', 'believe', 'heaven', 'sin'],
                 ['baseball', 'player', 'run', 'sport', 'hit', 'bat', 'rotation'],
                 ['electronics', 'conductive', 'power', 'resistor', 'circuit'],
                 ['medical', 'methodology', 'science', 'molecule', 'virus']]
    e = Embedder('..\\PreTrainedElmo_EN', batch_size=64)
    em_vecs = [np.mean(e.sents2elmo(cat_taxo)[0], axis=0) for cat_taxo in centroids]
    # X_train = np.loadtxt('custom_doc2vec_data\\X_train.csv', delimiter=',')
    # y_train = np.loadtxt('custom_doc2vec_data\\y_train.csv', delimiter=',')
    # dist = met.pairwise_distances(X=X_train, Y=list_centroids_vectors[0].reshape(1, -1), metric='cosine')
    dist = met.pairwise_distances(X=X, Y=np.vstack(em_vecs), metric='cosine')
    print(dist.shape)
    indexes = np.argmin(dist, axis=1)
    diff_list = (indexes - y).tolist()
    diff = [1 if d == 0 else 0 for d in diff_list]
    print('taxonomy-based semi supervised classification accuracy : {}'.format(
        sum(diff) / len(diff)))
    plt.plot(indexes)
    plt.figure()
    plt.plot(y)
    plt.show()
class BiaffineDependencyParser(Parser):
    r"""
    The implementation of Biaffine Dependency Parser.

    References:
        - Timothy Dozat and Christopher D. Manning. 2017.
          `Deep Biaffine Attention for Neural Dependency Parsing`_.

    .. _Deep Biaffine Attention for Neural Dependency Parsing:
        https://openreview.net/forum?id=Hk95PK9le
    """

    NAME = 'biaffine-dependency'
    MODEL = BiaffineDependencyModel

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        if self.args.feat in ('char', 'bert', 'elmo'):
            self.WORD, self.FEAT = self.transform.FORM
        else:
            self.WORD, self.FEAT = self.transform.FORM, self.transform.CPOS
        self.ARC, self.REL = self.transform.HEAD, self.transform.DEPREL
        self.puncts = torch.tensor([i for s, i in self.WORD.vocab.stoi.items()
                                    if ispunct(s)]).to(self.args.device)
        if self.args.elmo_options:
            self.elmo = ElmoEmbedder(self.args.elmo_options, self.args.elmo_weights, -1)
        else:
            self.efml = EFML(self.args.elmo_weights)
            self.elmo = False
        # print(self.__dict__)
        if self.args.map_method == 'vecmap':
            self.mapper = Vecmap(vars(self.args))
        elif self.args.map_method == 'elmogan':
            self.mapper = Elmogan(vars(self.args))
        elif self.args.map_method == 'muse':
            self.mapper = Muse(vars(self.args))
        else:
            self.mapper = None

    def train(self, train, dev, test, buckets=32, batch_size=5000, punct=False,
              tree=False, proj=False, partial=False, verbose=True, **kwargs):
        r"""
        Args:
            train/dev/test (list[list] or str):
                Filenames of the train/dev/test datasets.
            buckets (int):
                The number of buckets that sentences are assigned to. Default: 32.
            batch_size (int):
                The number of tokens in each batch. Default: 5000.
            punct (bool):
                If ``False``, ignores the punctuations during evaluation. Default: ``False``.
            tree (bool):
                If ``True``, ensures to output well-formed trees. Default: ``False``.
            proj (bool):
                If ``True``, ensures to output projective trees. Default: ``False``.
            partial (bool):
                ``True`` denotes the trees are partially annotated. Default: ``False``.
            verbose (bool):
                If ``True``, increases the output verbosity. Default: ``True``.
            kwargs (dict):
                A dict holding the unconsumed arguments that can be used to update
                the configurations for training.
        """

        return super().train(**Config().update(locals()))

    def evaluate(self, data, buckets=8, batch_size=5000, punct=False,
                 tree=True, proj=False, partial=False, verbose=True, **kwargs):
        r"""
        Args:
            data (str):
                The data for evaluation, both list of instances and filename are allowed.
            buckets (int):
                The number of buckets that sentences are assigned to. Default: 32.
            batch_size (int):
                The number of tokens in each batch. Default: 5000.
            punct (bool):
                If ``False``, ignores the punctuations during evaluation. Default: ``False``.
            tree (bool):
                If ``True``, ensures to output well-formed trees. Default: ``False``.
            proj (bool):
                If ``True``, ensures to output projective trees. Default: ``False``.
            partial (bool):
                ``True`` denotes the trees are partially annotated. Default: ``False``.
            verbose (bool):
                If ``True``, increases the output verbosity. Default: ``True``.
            kwargs (dict):
                A dict holding the unconsumed arguments that can be used to update
                the configurations for evaluation.

        Returns:
            The loss scalar and evaluation results.
        """

        if kwargs['elmo_options']:
            self.elmo = ElmoEmbedder(kwargs['elmo_options'], kwargs['elmo_weights'], -1)
        else:
            self.efml = EFML(kwargs['elmo_weights'])
        if kwargs['map_method'] == 'vecmap':
            self.mapper = Vecmap(kwargs)
            # print(self.mapper)
        elif kwargs['map_method'] == 'elmogan':
            self.mapper = Elmogan(kwargs)
            # print(self.mapper)
        elif kwargs['map_method'] == 'muse':
            self.mapper = Muse(kwargs)
        else:
            self.mapper = None

        return super().evaluate(**Config().update(locals()))

    def predict(self, data, pred=None, buckets=8, batch_size=5000, prob=False,
                tree=True, proj=False, verbose=True, **kwargs):
        r"""
        Args:
            data (list[list] or str):
                The data for prediction, both a list of instances and filename are allowed.
            pred (str):
                If specified, the predicted results will be saved to the file. Default: ``None``.
            buckets (int):
                The number of buckets that sentences are assigned to. Default: 32.
            batch_size (int):
                The number of tokens in each batch. Default: 5000.
            prob (bool):
                If ``True``, outputs the probabilities. Default: ``False``.
            tree (bool):
                If ``True``, ensures to output well-formed trees. Default: ``False``.
            proj (bool):
                If ``True``, ensures to output projective trees. Default: ``False``.
            verbose (bool):
                If ``True``, increases the output verbosity. Default: ``True``.
            kwargs (dict):
                A dict holding the unconsumed arguments that can be used to update
                the configurations for prediction.

        Returns:
            A :class:`~supar.utils.Dataset` object that stores the predicted results.
        """

        if kwargs['elmo_options']:
            self.elmo = ElmoEmbedder(kwargs['elmo_options'], kwargs['elmo_weights'], -1)
        else:
            self.efml = EFML(kwargs['elmo_weights'])
        if kwargs['map_method'] == 'vecmap':
            self.mapper = Vecmap(kwargs)
            # print(self.mapper)
        elif kwargs['map_method'] == 'elmogan':
            self.mapper = Elmogan(kwargs)
            # print(self.mapper)
        elif kwargs['map_method'] == 'muse':
            self.mapper = Muse(kwargs)
        else:
            self.mapper = None

        return super().predict(**Config().update(locals()))

    def _train(self, loader):
        self.model.train()

        bar, metric = progress_bar(loader), AttachmentMetric()

        # words, feats, etc. come from loader!
        # loader is train.loader, where train is a Dataset
        for words, feats, arcs, rels in bar:
            self.optimizer.zero_grad()
            if self.elmo:
                feat_embs = self.elmo.embed_batch(feats)
            else:
                feat_embs = self.efml.sents2elmo(feats, output_layer=-2)
            # TODO: add the mapping, if and only if this is vecmap
            if self.args.map_method == 'vecmap':
                # map feat_embs with vecmap, actually self.mapper defined in class init
                feat_embs = self.mapper.map_batch(feat_embs)
            mask = words.ne(self.WORD.pad_index)
            # ignore the first token of each sentence
            mask[:, 0] = 0
            feats0 = torch.zeros(words.shape + (1024,))  # words.clone()
            feats1 = torch.zeros(words.shape + (1024,))
            feats2 = torch.zeros(words.shape + (1024,))
            # words get ignored, all input comes from feats - 3 elmo layers
            # still inputting words due to reasons(tm)
            # feats0 = feats0.unsqueeze(-1)
            # feats0 = feats0.expand(words.shape+(1024,))
            for sentence in range(len(feat_embs)):
                for token in range(len(feat_embs[sentence][1])):
                    feats0[sentence][token] = torch.Tensor(feat_embs[sentence][0][token])
                    feats1[sentence][token] = torch.Tensor(feat_embs[sentence][1][token])
                    feats2[sentence][token] = torch.Tensor(feat_embs[sentence][2][token])
            feats = torch.cat((feats0, feats1, feats2), -1)
            if str(self.args.device) == '-1':
                feats = feats.to('cpu')
            else:
                feats = feats.to('cuda:' + str(self.args.device))
            # TODO: fix to allow cpu or gpu
            s_arc, s_rel = self.model(words, feats)  # INFO: here is the data input, y = model(x)
            loss = self.model.loss(s_arc, s_rel, arcs, rels, mask, self.args.partial)
            loss.backward()
            nn.utils.clip_grad_norm_(self.model.parameters(), self.args.clip)
            self.optimizer.step()
            self.scheduler.step()

            arc_preds, rel_preds = self.model.decode(s_arc, s_rel, mask)
            if self.args.partial:
                mask &= arcs.ge(0)
            # ignore all punctuation if not specified
            if not self.args.punct:
                mask &= words.unsqueeze(-1).ne(self.puncts).all(-1)
            metric(arc_preds, rel_preds, arcs, rels, mask)
            bar.set_postfix_str(f"lr: {self.scheduler.get_last_lr()[0]:.4e} - loss: {loss:.4f} - {metric}")

    @torch.no_grad()
    def _evaluate(self, loader):
        print("called _evaluate function")
        print(self.mapper)
        self.model.eval()

        total_loss, metric = 0, AttachmentMetric()

        for words, feats, arcs, rels in loader:
            if self.elmo:
                feat_embs0 = self.elmo.embed_batch(feats)
            else:
                feat_embs0 = self.efml.sents2elmo(feats, output_layer=-2)
            if self.mapper:
                # map feat_embs with self.mapper defined in class init
                feat_embs = self.mapper.map_batch(feat_embs0)
            else:
                feat_embs = feat_embs0
            mask = words.ne(self.WORD.pad_index)
            # ignore the first token of each sentence
            mask[:, 0] = 0
            feats0 = torch.zeros(words.shape + (1024,))
            feats1 = torch.zeros(words.shape + (1024,))
            feats2 = torch.zeros(words.shape + (1024,))
            for sentence in range(len(feat_embs)):
                for token in range(len(feat_embs[sentence][1])):
                    feats0[sentence][token] = torch.Tensor(feat_embs[sentence][0][token])
                    feats1[sentence][token] = torch.Tensor(feat_embs[sentence][1][token])
                    feats2[sentence][token] = torch.Tensor(feat_embs[sentence][2][token])
            feats = torch.cat((feats0, feats1, feats2), -1)
            if str(self.args.device) == '-1':
                feats = feats.to('cpu')
            else:
                feats = feats.to('cuda:' + str(self.args.device))
            s_arc, s_rel = self.model(words, feats)
            loss = self.model.loss(s_arc, s_rel, arcs, rels, mask, self.args.partial)
            arc_preds, rel_preds = self.model.decode(s_arc, s_rel, mask,
                                                     self.args.tree, self.args.proj)
            if self.args.partial:
                mask &= arcs.ge(0)
            # ignore all punctuation if not specified
            if not self.args.punct:
                mask &= words.unsqueeze(-1).ne(self.puncts).all(-1)
            total_loss += loss.item()
            metric(arc_preds, rel_preds, arcs, rels, mask)
        total_loss /= len(loader)

        return total_loss, metric

    @torch.no_grad()
    def _predict(self, loader):
        self.model.eval()

        preds = {}
        arcs, rels, probs = [], [], []
        for words, feats in progress_bar(loader):
            if self.elmo:
                feat_embs = self.elmo.embed_batch(feats)
            else:
                feat_embs = self.efml.sents2elmo(feats, output_layer=-2)
            if self.mapper:
                # map feat_embs with self.mapper defined in class init
                feat_embs = self.mapper.map_batch(feat_embs)
            mask = words.ne(self.WORD.pad_index)
            # ignore the first token of each sentence
            mask[:, 0] = 0
            lens = mask.sum(1).tolist()
            feats0 = torch.zeros(words.shape + (1024,))
            feats1 = torch.zeros(words.shape + (1024,))
            feats2 = torch.zeros(words.shape + (1024,))
            for sentence in range(len(feat_embs)):
                for token in range(len(feat_embs[sentence][1])):
                    feats0[sentence][token] = torch.Tensor(feat_embs[sentence][0][token])
                    feats1[sentence][token] = torch.Tensor(feat_embs[sentence][1][token])
                    feats2[sentence][token] = torch.Tensor(feat_embs[sentence][2][token])
            feats = torch.cat((feats0, feats1, feats2), -1)
            if str(self.args.device) == '-1':
                feats = feats.to('cpu')
            else:
                feats = feats.to('cuda:' + str(self.args.device))
            s_arc, s_rel = self.model(words, feats)
            arc_preds, rel_preds = self.model.decode(s_arc, s_rel, mask,
                                                     self.args.tree, self.args.proj)
            arcs.extend(arc_preds[mask].split(lens))
            rels.extend(rel_preds[mask].split(lens))
            if self.args.prob:
                arc_probs = s_arc.softmax(-1)
                probs.extend([prob[1:i + 1, :i + 1].cpu()
                              for i, prob in zip(lens, arc_probs.unbind())])
        arcs = [seq.tolist() for seq in arcs]
        rels = [self.REL.vocab[seq.tolist()] for seq in rels]
        preds = {'arcs': arcs, 'rels': rels}
        if self.args.prob:
            preds['probs'] = probs

        return preds

    @classmethod
    def build(cls, path, min_freq=2, fix_len=20, **kwargs):
        r"""
        Build a brand-new Parser, including initialization of all data fields and model parameters.

        Args:
            path (str):
                The path of the model to be saved.
            min_freq (str):
                The minimum frequency needed to include a token in the vocabulary. Default: 2.
            fix_len (int):
                The max length of all subword pieces. The excess part of each piece will be
                truncated. Required if using CharLSTM/BERT. Default: 20.
            kwargs (dict):
                A dict holding the unconsumed arguments.
        """

        args = Config(**locals())
        args.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        os.makedirs(os.path.dirname(path), exist_ok=True)
        if os.path.exists(path) and not args.build:
            parser = cls.load(**args)
            parser.model = cls.MODEL(**parser.args)
            parser.model.load_pretrained(parser.WORD.embed).to(args.device)
            return parser

        logger.info("Building the fields")
        WORD = Field('words', pad=pad, unk=unk, bos=bos, lower=True)
        if args.feat == 'char':
            FEAT = SubwordField('chars', pad=pad, unk=unk, bos=bos, fix_len=args.fix_len)
        elif args.feat == 'bert':
            from transformers import AutoTokenizer
            tokenizer = AutoTokenizer.from_pretrained(args.bert)
            FEAT = SubwordField('bert',
                                pad=tokenizer.pad_token,
                                unk=tokenizer.unk_token,
                                bos=tokenizer.bos_token or tokenizer.cls_token,
                                fix_len=args.fix_len,
                                tokenize=tokenizer.tokenize)
            FEAT.vocab = tokenizer.get_vocab()
        elif args.feat == 'elmo':
            logger.info("Hello, initing ElmoField")
            FEAT = ElmoField('elmo', bos=bos)
        else:
            FEAT = Field('tags', bos=bos)
        ARC = Field('arcs', bos=bos, use_vocab=False, fn=CoNLL.get_arcs)
        REL = Field('rels', bos=bos)
        if args.feat in ('char', 'bert'):
            transform = CoNLL(FORM=(WORD, FEAT), HEAD=ARC, DEPREL=REL)
        elif args.feat == 'elmo':
            logger.info("calling CoNLL transform")
            # FEAT still carries all 3 ELMo layers here, this will need fixing somehow
            transform = CoNLL(FORM=(WORD, FEAT), HEAD=ARC, DEPREL=REL)
        else:
            transform = CoNLL(FORM=WORD, CPOS=FEAT, HEAD=ARC, DEPREL=REL)
        logger.info("initing train Dataset")
        train = Dataset(transform, args.train)
        # WORD.build(train, args.min_freq, (Embedding.load(args.embed, args.unk) if args.embed else None))
        logger.info("Building WORD, FEAT, REL fields")
        WORD.build(train)
        FEAT.build(train)
        REL.build(train)
        args.update({
            'n_words': WORD.vocab.n_init,
            'n_feats': len(FEAT.vocab),
            'n_rels': len(REL.vocab),
            'pad_index': WORD.pad_index,
            'unk_index': WORD.unk_index,
            'bos_index': WORD.bos_index,
            'feat_pad_index': FEAT.pad_index,
        })
        logger.info("Loading model")
        model = cls.MODEL(**args)
        model.load_pretrained(WORD.embed).to(args.device)

        return cls(args, model, transform)
class DataModel():
    def __init__(self, opt):
        super(DataModel, self).__init__()
        self.opt = opt
        self.use_gpu = self.opt.use_gpu
        if self.opt.emb_method == 'elmo':
            self.init_elmo()
        elif self.opt.emb_method == 'glove':
            self.init_glove()
        elif self.opt.emb_method == 'word2vec':
            self.init_word2vec()
        elif self.opt.emb_method == 'elmo_word2vec':
            self.init_word2vec()
            self.init_elmo()
            self.word_dim = self.opt.elmo_dim + self.opt.word2vec_dim
        elif self.opt.emb_method == 'elmo_glove':
            self.init_elmo()
            self.init_glove()
            self.word_dim = self.opt.elmo_dim + self.opt.glove_dim
        elif self.opt.emb_method == 'all':
            self.init_elmo()
            self.init_glove()
            self.init_word2vec()
            self.word_dim = self.opt.elmo_dim + self.opt.glove_dim + self.opt.word2vec_dim

    def init_elmo(self):
        '''
        initialize the ELMo model
        '''
        self.elmo = Embedder(self.opt.elmo_model, batch_size=self.opt.batch_size)

    def init_word2vec(self):
        self.word2vec_word2id = np.load(self.opt.word2vec_w2id_file).tolist()
        self.word2vec_vocab_size = len(self.word2vec_word2id)
        self.word2vec = nn.Embedding(self.word2vec_vocab_size, self.opt.word2vec_dim)
        emb = torch.from_numpy(np.load(self.opt.word2vec_file))
        self.word2vec.weight.data.copy_(emb)

    def init_glove(self):
        self.glove_word2id = np.load(self.opt.glove_w2id_file).tolist()
        self.glove_vocab_size = len(self.glove_word2id)
        self.glove = nn.Embedding(self.glove_vocab_size, self.opt.glove_dim)
        emb = torch.from_numpy(np.load(self.opt.glove_file))
        self.glove.weight.data.copy_(emb)

    def get_elmo(self, sentence_lists):
        '''
        get the ELMo word embedding vectors for the sentences
        '''
        max_len = max(map(lambda x: len(x), sentence_lists))
        sentence_lists = self.elmo.sents2elmo(sentence_lists)
        sentence_end = []
        # pad every sentence to the same length
        for sentence in sentence_lists:
            sentence = sentence.tolist()
            for i in range(max_len - len(sentence)):
                sentence.append([0] * self.opt.elmo_dim)
            sentence_end.append(sentence)
        return torch.FloatTensor(sentence_end)

    def get_word2vec(self, sentence_lists):
        # vectors for the word-segmented input
        max_len = max(map(lambda x: len(x), sentence_lists))
        sentence_lists = list(
            map(lambda x: list(map(lambda w: self.word2vec_word2id.get(w, 1), x)),
                sentence_lists))
        # pad with all-zero ids
        sentence_lists = list(
            map(lambda x: x + [0] * (max_len - len(x)), sentence_lists))
        sentence_lists = torch.LongTensor(sentence_lists)
        embeddings = self.word2vec(sentence_lists)
        return embeddings

    # vectors for the character-segmented input
    def get_glove(self, sentence_lists):
        '''
        get the glove word embedding vectors for the sentences
        '''
        max_len = max(map(lambda x: len(x), sentence_lists))
        # UNK --> 1
        sentence_lists = list(
            map(lambda x: list(map(lambda w: self.glove_word2id.get(w, 1), x)),
                sentence_lists))
        # pad with all-zero ids
        sentence_lists = list(
            map(lambda x: x + [0] * (max_len - len(x)), sentence_lists))
        sentence_lists = torch.LongTensor(sentence_lists)
        embeddings = self.glove(sentence_lists)
        return embeddings

    def get_data(self, x):
        if self.opt.emb_method == 'elmo':
            word_embs = self.get_elmo(x)
        elif self.opt.emb_method == 'glove':
            word_embs = self.get_glove(x)
        elif self.opt.emb_method == 'elmo_word2vec':
            word2vec = self.get_word2vec(x)
            elmo = self.get_elmo(x)
            word_embs = torch.cat([elmo, word2vec], -1)
        elif self.opt.emb_method == 'elmo_glove':
            glove = self.get_glove(x)
            elmo = self.get_elmo(x)
            word_embs = torch.cat([elmo, glove], -1)
        elif self.opt.emb_method == 'all':
            glove = self.get_glove(x)
            word2vec = self.get_word2vec(x)
            elmo = self.get_elmo(x)
            word_embs = torch.cat([elmo, glove, word2vec], -1)
        return word_embs
        tokenlist.append(token)
    tokenlist = tokenlist + test_tokenlist
    return tokenlist


if __name__ == "__main__":
    filename = 'UD_English-EWT'
    language = "English-EWT"
    tokenlist = data_preprocesser(filename)
    word_data = data_extractor(tokenlist)
    word_index, sentence = word_tokenizer(word_data)
    k = Embedder('144/', batch_size=32)
    embedding_vectors = {}
    # word_index = {'the': 0, 'are': 1, 'is': 2}
    test_word = [[]]
    for word, index in word_index.items():
        test_word[0].append(word)
    diction = k.sents2elmo(test_word)
    for word, index in word_index.items():
        embedding_vectors[word] = list(diction[0][index])
        temp = np.asarray(embedding_vectors[word])
    savecsv(language, embedding_vectors)
class WordRep(nn.Module):
    def __init__(self, data, opt):
        super(WordRep, self).__init__()

        self.gpu = opt.gpu
        self.batch_size = opt.batch_size

        self.use_elmo = False
        if opt.elmo:
            logging.info("use elmo, loading ...")
            self.use_elmo = True
            self.elmo = Embedder(data.config['elmo_path'])
            # we project the elmo representation to the same dim of char embedding
            self.elmo_projection = nn.Linear(
                self.elmo.config['encoder']['projection_dim'] * 2,
                opt.char_hidden_dim, False)
            self.elmo_drop = nn.Dropout(opt.dropout)
        else:
            self.char_hidden_dim = opt.char_hidden_dim
            self.char_embedding_dim = opt.char_emb_dim
            self.char_feature = CharCNN(data.char_alphabet.size(), None,
                                        self.char_embedding_dim,
                                        self.char_hidden_dim,
                                        opt.dropout, self.gpu)

        self.embedding_dim = data.word_emb_dim
        self.drop = nn.Dropout(opt.dropout)
        self.word_embedding = nn.Embedding(data.word_alphabet.size(), self.embedding_dim)
        if data.pretrain_word_embedding is not None:
            self.word_embedding.weight.data.copy_(
                torch.from_numpy(data.pretrain_word_embedding))
        else:
            self.word_embedding.weight.data.copy_(
                torch.from_numpy(self.random_embedding(data.word_alphabet.size(),
                                                       self.embedding_dim)))

        if data.feat_config is not None:
            self.feature_num = len(data.feature_alphabets)
            self.feature_embedding_dims = data.feature_emb_dims
            self.feature_embeddings = nn.ModuleList()
            for idx in range(self.feature_num):
                emb = nn.Embedding(data.feature_alphabets[idx].size(),
                                   self.feature_embedding_dims[idx])
                emb.weight.data.copy_(
                    torch.from_numpy(self.random_embedding(
                        data.feature_alphabets[idx].size(),
                        self.feature_embedding_dims[idx])))
                self.feature_embeddings.append(emb)
        else:
            self.feature_num = 0

        if opt.gpu >= 0 and torch.cuda.is_available():
            self.drop = self.drop.cuda(self.gpu)
            self.word_embedding = self.word_embedding.cuda(self.gpu)
            if data.feat_config is not None:
                for idx in range(self.feature_num):
                    self.feature_embeddings[idx] = self.feature_embeddings[idx].cuda(self.gpu)
            if opt.elmo:
                self.elmo_projection = self.elmo_projection.cuda(self.gpu)
                self.elmo_drop = self.elmo_drop.cuda(self.gpu)

    def random_embedding(self, vocab_size, embedding_dim):
        pretrain_emb = np.zeros([vocab_size, embedding_dim])
        scale = np.sqrt(3.0 / embedding_dim)
        for index in range(vocab_size):
            pretrain_emb[index, :] = np.random.uniform(-scale, scale, [1, embedding_dim])
        return pretrain_emb

    def forward(self, word_inputs, word_seq_lengths, char_inputs, char_seq_lengths,
                char_seq_recover, feature_inputs, text_inputs):
        """
        input:
            word_inputs: (batch_size, sent_len)
            features: list [(batch_size, sent_len), (batch_len, sent_len), ...]
            word_seq_lengths: list of batch_size, (batch_size, 1)
            char_inputs: (batch_size*sent_len, word_length)
            char_seq_lengths: list of whole batch_size for char, (batch_size*sent_len, 1)
            char_seq_recover: variable which records the char order information,
                used to recover char order
        output:
            Variable(batch_size, sent_len, hidden_dim)
        """
        batch_size = word_inputs.size(0)
        sent_len = word_inputs.size(1)
        word_embs = self.word_embedding(word_inputs)
        word_list = [word_embs]

        for idx in range(self.feature_num):
            word_list.append(self.feature_embeddings[idx](feature_inputs[idx]))

        if self.use_elmo:
            with torch.no_grad():
                elmo_rep = torch.from_numpy(
                    np.array(self.elmo.sents2elmo(text_inputs)))  # batch, seq_len, 1024
            if self.gpu >= 0 and torch.cuda.is_available():
                elmo_rep = elmo_rep.cuda(self.gpu)
            char_features = self.elmo_drop(self.elmo_projection(elmo_rep))
            # char_features = elmo_rep
        else:
            char_features = self.char_feature.get_last_hiddens(
                char_inputs, char_seq_lengths.cpu().numpy())
            char_features = char_features[char_seq_recover]
            char_features = char_features.view(batch_size, sent_len, -1)
        word_list.append(char_features)

        word_embs = torch.cat(word_list, 2)
        word_represent = self.drop(word_embs)
        return word_represent
from elmoformanylangs import Embedder

e = Embedder('./elmo_chinese')

sents = [['今', '天', '天气', '真', '好', '阿'],
         ['潮水', '退', '了', '就', '知道', '谁', '沒', '起床']]
# will return a list of numpy arrays, each with the shape=(seq_len, embedding_size)
output = e.sents2elmo(sents)
print(output)
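Per the ELMoForManyLangs README, the second argument of sents2elmo selects which layer(s) to return: 0 is the word-encoder layer, 1 and 2 are the two biLSTM layers, -1 (the default) averages the three, and -2 returns all three. A short sketch continuing the example above:

avg = e.sents2elmo(sents, -1)          # default: average of the 3 layers, (seq_len, 1024) per sentence
all_layers = e.sents2elmo(sents, -2)   # all 3 layers, (3, seq_len, 1024) per sentence
print(avg[0].shape, all_layers[0].shape)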