def generate_data_set(test_cats=my_cats, subset='train'):
    dataset_list = [list(read_corpus(fetch_20newsgroups(subset=subset,
                                                        remove=('headers', 'footers', 'quotes'),
                                                        categories=[cat])['data'],
                                     tokens_only=True, bFastText=False, bRemoveStopWords=True))
                    for cat in test_cats]
    corpus = [doc for category_list in dataset_list for doc in category_list]
    categories_lengths = [len(cat_list) for cat_list in dataset_list]
    categories = [[k for _ in range(0, length)]
                  for k, length in enumerate(categories_lengths)]
    cats = [cat for elem_list in categories for cat in elem_list]
    y = np.array(cats)
    np.savetxt('ELmo_20news_group_rep\\y_train_reduced.csv',
               y[range(0, len(cats), 50)], delimiter=",")
    print('raw corpus ELMO Rep {} size {}'.format(subset, len(corpus)))
    e = Embedder('..\\PreTrainedElmo_EN', batch_size=64)
    try:
        os.remove('ELmo_20news_group_rep\\X_train_reduced.csv')
    except OSError:
        pass
    with open('ELmo_20news_group_rep\\X_train_reduced.csv', mode='a') as myFile:
        # Embed every 50th document; average its token vectors into one row.
        # for i in range(0, 2):
        for i in range(0, len(corpus), 50):
            em = np.mean(e.sents2elmo([corpus[i]])[0], axis=0)
            # print(em.shape)
            myFile.write('{}'.format(em.tolist()).strip('[').strip(']').replace(' ', ''))
            myFile.write('\n')
def trainELMoToFile(self, elmoModelPath, filename, words, vocab_size):
    """Build ELMo word embeddings.

    elmoModelPath: absolute path to the model to process*
    filename: filename to save the word embeddings to
    words: word list (vocabulary)
    vocab_size: length of the word list (vocabulary)

    * See https://github.com/HIT-SCIR/ELMoForManyLangs#downloads to download
      the Spanish model and check the setup.
    """
    from elmoformanylangs import Embedder
    import numpy as np
    e = Embedder(elmoModelPath)  # path to the loaded model (e.g. /home/user/project/ELMo.es/)
    # Vectors are streamed to `filename`; this matrix is left zero-initialized.
    embedding_matrix = np.zeros((vocab_size, 1024))
    for i, word in enumerate(words):
        aux_elmo = e.sents2elmo([[word]])
        with open(filename, 'a') as g:
            strnums = [str(num) for num in aux_elmo[0][0].tolist()]
            strnums = ' '.join(strnums)
            g.write("{} {}\n".format(word, strnums))
            # print("Processing \t{} of {}...".format(i + 1, len(words)))
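# A minimal call sketch for trainELMoToFile, assuming `obj` is an instance of
# the class that defines it; the model path, output filename, and vocabulary
# below are illustrative placeholders, not values from the original code.
words = ['hola', 'mundo', 'adiós']
obj.trainELMoToFile(elmoModelPath='/home/user/project/ELMo.es/',
                    filename='elmo_vocab_vectors.txt',
                    words=words,
                    vocab_size=len(words))
# Each output line has the form "<word> <1024 space-separated floats>".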
def get_elmo_vector(vocab: list) -> np.ndarray:
    from elmoformanylangs import Embedder
    e = Embedder('{}/zhs.model'.format(DATA_DIR))
    vectors = np.zeros((len(vocab), 1024))
    # The first two vocabulary slots (e.g. special tokens) keep zero vectors.
    vectors[2:] = e.sents2elmo(vocab[2:])
    scio.savemat('{}/elmo.mat'.format(DATA_DIR), {'vectors': vectors})
    return vectors
class ElmoNer(nn.Module):
    def __init__(self, num_units, rnn_hidden, num_tags, num_layers=1, use_cuda=False):
        super(ElmoNer, self).__init__()
        self.use_cuda = use_cuda
        self.embedding = Embedder(ELMO_PRETAIN_PATH)
        self.rnn = nn.GRU(num_units, rnn_hidden, num_layers=num_layers,
                          batch_first=True, bidirectional=True)
        self.linear = nn.Linear(2 * rnn_hidden, num_tags)
        # self.linear = nn.Linear(num_units, num_tags)
        self.crf = CRF(num_tags)

    def forward(self, x_data, y_data, masks):
        """Forward pass.

        :param x_data: batch of tokenized sentences
        :param y_data: gold tag ids
        :param masks: padding masks
        :return: negative CRF log-likelihood
        """
        encoded_layers = self.embedding.sents2elmo(x_data)
        out = self.rnn_layer(encoded_layers)
        loss = -1 * self.crf(out, y_data.transpose(0, 1), masks.transpose(0, 1))
        return loss

    def rnn_layer(self, encoded_layers):
        """Map ELMo outputs to CRF emission scores.

        :param encoded_layers: (batch, seq_len, hidden)
        :return: (seq_len, batch, num_tags)
        """
        encoded_layers = np.array(encoded_layers)
        encoded_layers = torch.from_numpy(encoded_layers)
        if self.use_cuda:
            encoded_layers = encoded_layers.cuda()
        out, _ = self.rnn(encoded_layers)
        out = self.linear(out)
        out = out.transpose(0, 1)
        return out

    def test(self, x_data, masks):
        encoded_layers = self.embedding.sents2elmo(x_data)
        out = self.rnn_layer(encoded_layers)
        best_paths = self.crf.decode(out, mask=masks.transpose(0, 1))
        return best_paths
class embed(object):
    def __init__(self, source, path):
        self.source = source
        self.path = path
        self._load_model()

    def _aravec(self, path):
        self.model = gensim.models.Word2Vec.load(path)
        self.vector_size = self.model.vector_size

    def _fasttext(self, path):
        self.model = fasttext.load_model(path)
        self.vector_size = self.model.get_dimension()

    def _elmo(self, path):
        self.model = Embedder(path, 64)

    def _load_model(self):
        if self.source == "aravec":
            self._aravec(self.path)
        elif self.source == "fasttext":
            self._fasttext(self.path)
        elif self.source == "elmo":
            self._elmo(self.path)
        else:
            raise ValueError("Model not supported. Please select either aravec, fasttext or elmo")

    def _embed_single(self, text, max_len):
        if self.source == "aravec":
            embedding = [self.model.wv[i].reshape(-1, self.vector_size)
                         for i in text.split() if i in self.model.wv]
            if len(embedding) == 0:
                return self._pad(np.zeros((1, self.vector_size)), max_len)
            embedding = np.concatenate(embedding, axis=0)
            return self._pad(embedding, max_len)
        if self.source == "fasttext":
            embedding = [self.model.get_word_vector(i).reshape(-1, self.vector_size)
                         for i in text.split()]
            embedding = np.concatenate(embedding, axis=0)
            return self._pad(embedding, max_len)

    def embed_batch(self, text_list, max_len):
        if self.source == "elmo":
            input_segmented = [i.split() for i in text_list]
            embedding = self.model.sents2elmo(input_segmented)
            embedding = [self._pad(i, max_len) for i in embedding]
            return np.concatenate(embedding, axis=0)
        else:
            batch = [self._embed_single(i, max_len) for i in text_list]
            return np.concatenate(batch)

    def _pad(self, array, max_len):
        # Truncate or zero-pad to max_len rows, then add a leading batch axis.
        if array.shape[0] >= max_len:
            return np.expand_dims(array[:max_len], 0)
        padding_size = max_len - array.shape[0]
        return np.expand_dims(np.pad(array, [(0, padding_size), (0, 0)],
                                     mode='constant', constant_values=0), 0)
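# A usage sketch for the embed wrapper above; the ELMo model path is a
# placeholder and the texts are illustrative only.
embedder = embed(source="elmo", path="path/to/elmo_model")
batch = embedder.embed_batch(["this is a test", "another sentence"], max_len=10)
print(batch.shape)  # (2, 10, 1024): padded/truncated token vectors per text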
def __init__(self, data_path, max_seq_len=None, lang='en'):
    df = pd.read_csv(data_path, sep='\t')
    df = df.set_index('id')
    tokenizer = TweetTokenizer()
    # .as_matrix() is removed in recent pandas; .values is equivalent.
    self.labels = df[['HS', 'TR', 'AG']].values
    self.text = [tokenizer.tokenize(text) for text in df['text']]
    self.max_seq_len = max_seq_len
    if lang == 'en':
        self.elmo = Embedder('elmo/english/')
    elif lang == 'es':
        self.elmo = Embedder('elmo/spanish/')
class ELMOTagger(BaseModel):
    def __init__(self, config, dataset):
        super().__init__(config)
        self.dataset = dataset
        self.output_size = len(dataset.vocabs.pos)
        self.embedder = Embedder(self.config.elmo_model,
                                 batch_size=self.config.batch_size)
        self.elmo_layer = self.config.elmo_layer
        if hasattr(self.config, 'lstm_size'):
            self.lstm = nn.LSTM(1024, self.config.lstm_size,
                                batch_first=True,
                                dropout=self.config.dropout,
                                num_layers=self.config.lstm_num_layers,
                                bidirectional=True)
            hidden_size = self.config.lstm_size * 2
        else:
            self.lstm = None
            hidden_size = 1024
        if self.elmo_layer == 'weighted_sum':
            self.elmo_weights = nn.Parameter(torch.ones(3, dtype=torch.float))
        self.output_proj = nn.Linear(hidden_size, self.output_size)
        # ignore <pad> = 3
        self.criterion = nn.CrossEntropyLoss(
            ignore_index=self.dataset.vocabs.pos['<pad>'])

    def compute_loss(self, batch, output):
        target = to_cuda(torch.LongTensor(batch.pos))
        return compute_sequence_loss(target, output, self.criterion)

    def forward(self, batch):
        batch_size = len(batch[0])
        if self.elmo_layer == 'mean':
            embedded = self.embedder.sents2elmo(batch.sentence, -1)
            embedded = np.stack(embedded)
            embedded = to_cuda(torch.from_numpy(embedded))
        elif self.elmo_layer == 'weighted_sum':
            embedded = self.embedder.sents2elmo(batch.sentence, -2)
            embedded = np.stack(embedded)
            embedded = to_cuda(torch.from_numpy(embedded))
            embedded = (self.elmo_weights[None, :, None, None] * embedded).sum(1)
        else:
            embedded = self.embedder.sents2elmo(batch.sentence, self.elmo_layer)
            embedded = np.stack(embedded)
            embedded = to_cuda(torch.from_numpy(embedded))
        if self.lstm:
            embedded = self.lstm(embedded)[0]
        return self.output_proj(embedded)
def __init__(self, max_p_num, max_p_len, max_q_len,
             train_files=[], dev_files=[], test_files=[], vocab=Vocab(lower=True)):
    self.logger = logging.getLogger("brc")
    self.max_p_num = max_p_num
    self.max_p_len = max_p_len
    self.max_q_len = max_q_len
    # **************************
    self.do = 1
    # Whether to use a BERT model's hidden layers as the word-vector input.
    self.do_bert = False
    self.do_elmo = True
    if self.do_bert:
        self.bc = BertClient()
        self.vocab = vocab
        # if train_files:
        #     self._load_batch_size_data_set('train', train_files[0], train=True)
        # if dev_files:
        #     self._load_batch_size_data_set('dev', train_files[0])
        # if test_files:
        #     self._load_batch_size_data_set('test', train_files[0])
    if self.do_elmo:
        self.bc = Embedder('/tmp/ELMoForManyLangs/zhs.model')
        self.vocab = vocab
        # (same commented-out loading hooks as in the BERT branch above)
    # **************************
    self.train_set, self.dev_set, self.test_set = [], [], []
    # Lists support concatenation with +=.
    if not self.do_bert and not self.do_elmo:
        print('****************************')
        if train_files:
            # Load the training-set contents into self.train_set.
            for train_file in train_files:
                self.train_set += self._load_dataset(train_file, train=True)
            self.logger.info('Train set size: {} questions.'.format(len(self.train_set)))
        if dev_files:
            for dev_file in dev_files:
                self.dev_set += self._load_dataset(dev_file)
            self.logger.info('Dev set size: {} questions.'.format(len(self.dev_set)))
        if test_files:
            for test_file in test_files:
                self.test_set += self._load_dataset(test_file)
            self.logger.info('Test set size: {} questions.'.format(len(self.test_set)))
def test():
    import numpy as np
    e = Embedder(ELMO_PRETAIN_PATH)
    text = [['今', '天', '天', '气', '真', '好', '阿'],
            ['你', '吃', '饭', '了', '吗', '?', 'd']]
    a = e.sents2elmo(text)
    a = np.array(a)
    print(a.shape)
    print(a)
    new_text = [chs_to_cht(line) for line in text]
    b = e.sents2elmo(new_text)
    print(b[0].shape, b[1].shape)
    print(b)
def elmo_data(samples):
    from elmoformanylangs import Embedder
    elmo = Embedder('data/elmo_model', batch_size=1)
    data = SimpleNamespace()
    data.x = elmo.sents2elmo([sample.text for sample in samples])
    # Mean-pool the token vectors so each sample becomes one 1024-d vector.
    data.x = np.array([np.mean(datum, axis=0) for datum in data.x])
    data.y = np.array([sample.label for sample in samples])
    return data
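# A minimal usage sketch for elmo_data; samples with .text (a token list) and
# .label attributes are assumed, as in the function above, and the values
# here are illustrative only.
from types import SimpleNamespace

samples = [SimpleNamespace(text=['good', 'movie'], label=1),
           SimpleNamespace(text=['bad', 'movie'], label=0)]
data = elmo_data(samples)
print(data.x.shape)  # (2, 1024): one mean-pooled ELMo vector per sample
print(data.y)        # array([1, 0])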
def test_elmoformanylangs():
    e = Embedder('/Users/feili/project/ELMoForManyLangs/output/en')
    # e = Embedder('/Users/feili/resource/data_to_train_emb/elmo_ade_lower_0norm_200d')
    sents = [['LABA', ',', 'such', 'as', 'vilanterol']]
    # for idx, sent in enumerate(sents):
    #     for idy, tk in enumerate(sent):
    #         sents[idx][idy] = sents[idx][idy].lower()
    # Returns a list of numpy arrays, each with shape (seq_len, embedding_size).
    ret = e.sents2elmo(sents)
def BuildDataSet(subset='train', output=-1):
    dataset_list = [list(read_corpus(fetch_20newsgroups(subset=subset,
                                                        remove=('headers', 'footers', 'quotes'),
                                                        categories=[cat])['data'],
                                     tokens_only=True, bFastText=False, bRemoveStopWords=True))
                    for cat in my_cats]
    corpus = [doc for category_list in dataset_list for doc in category_list]
    categories_lengths = [len(cat_list) for cat_list in dataset_list]
    categories = [[k for _ in range(0, length)]
                  for k, length in enumerate(categories_lengths)]
    cats = [cat for elem_list in categories for cat in elem_list]
    y = np.array(cats)
    np.savetxt('ELmo_20news_group_rep\\y_{}.csv'.format(subset), y, delimiter=",")
    print('raw corpus ELMO Rep {} size {}'.format(subset, len(corpus)))
    e = Embedder('..\\PreTrainedElmo_EN', batch_size=64)
    # Embed the corpus in three chunks so partial results survive a crash.
    third = len(corpus) // 3
    bounds = [(0, third), (third, 2 * third), (2 * third, len(corpus))]
    for chunk, (start, stop) in enumerate(bounds, start=1):
        path = 'ELmo_20news_group_rep\\X_3L_{}{}.csv'.format(subset, chunk)
        try:
            os.remove(path)
        except OSError:
            pass
        t0 = time()
        np_em = np.vstack([np.mean(np.array(e.sents2elmo([corpus[i]], output_layer=-2)[0]),
                                   axis=1).reshape(-1)
                           for i in range(start, stop)])
        np.savetxt(path, np_em, delimiter=',')
        print(f'finished generating chunk {chunk} of elmo reps in {time() - t0} seconds')
class PretrainEmbedder(ElmoEmbedding):
    def __init__(self, model_dir, batch_size=64):
        super(PretrainEmbedder, self).__init__()
        self.embedder = Embedder(model_dir, batch_size)

    def predict(self, sentences, layer_index=-1):
        return self.embedder.sents2elmo(sentences, layer_index)
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    if self.args.feat in ('char', 'bert', 'elmo'):
        self.WORD, self.FEAT = self.transform.FORM
    else:
        self.WORD, self.FEAT = self.transform.FORM, self.transform.CPOS
    self.ARC, self.REL = self.transform.HEAD, self.transform.DEPREL
    self.puncts = torch.tensor([i for s, i in self.WORD.vocab.stoi.items()
                                if ispunct(s)]).to(self.args.device)
    if self.args.elmo_options:
        self.elmo = ElmoEmbedder(self.args.elmo_options, self.args.elmo_weights, -1)
    else:
        self.efml = EFML(self.args.elmo_weights)
        self.elmo = False
    # print(self.__dict__)
    if self.args.map_method == 'vecmap':
        self.mapper = Vecmap(vars(self.args))
    elif self.args.map_method == 'elmogan':
        self.mapper = Elmogan(vars(self.args))
    elif self.args.map_method == 'muse':
        self.mapper = Muse(vars(self.args))
    else:
        self.mapper = None
class Elmo:
    def __init__(self, lang="fr"):
        self.name = "elmo"
        if lang == "fr":
            from elmoformanylangs import Embedder
            self.e = Embedder('/home/bmazoyer/Dev/ELMoForManyLangs/150', batch_size=32)
            self.vectors = None
        elif lang == "en":
            import tensorflow as tf
            import tensorflow_hub as hub
            self.embed = hub.Module("https://tfhub.dev/google/elmo/2")
            self.session = tf.Session()
            self.session.run(tf.global_variables_initializer())
            self.session.run(tf.tables_initializer())
        self.lang = lang

    def populate_array(self, data):
        logging.info(data.name)
        self.vectors[data.name] = np.mean(
            np.array(self.e.sents2elmo([data.text.split()]))[0], axis=0)

    def compute_vectors(self, data):
        n = data.shape[0]
        self.vectors = np.zeros((n, 1024))
        if self.lang == "fr":
            data.apply(self.populate_array, axis=1)
            return self.vectors
        elif self.lang == "en":
            batch_size = 64
            for i in tqdm(range(0, n, batch_size)):
                self.vectors[i:min(n, i + batch_size)] = self.session.run(
                    self.embed(data.text[i:min(n, i + batch_size)].tolist(),
                               signature="default", as_dict=True)["default"])
            return self.vectors
class WordEmbeddings():
    """
    ELMo
    https://allennlp.org/elmo
    """

    def __init__(self, model_path=r'../auxiliary_data/zhs.model/', cuda_device=0):
        self.cuda_device = cuda_device
        self.elmo = Embedder(model_path)

    def get_tokenized_words_embeddings(self, sents_tokened):
        """
        @see EmbeddingDistributor
        :param sents_tokened: list of tokenized words string (sentences/phrases)
        :return: ndarray with shape (len(sents), dimension of embeddings)
        """
        max_len = max([len(sent) for sent in sents_tokened])
        elmo_embedding = self.elmo.sents2elmo(sents_tokened, output_layer=-2)
        # Each embedding has shape (3, seq_len, 1024); zero-pad the seq_len axis.
        elmo_embedding = [np.pad(emb, pad_width=((0, 0), (0, max_len - emb.shape[1]), (0, 0)),
                                 mode='constant')
                          for emb in elmo_embedding]
        elmo_embedding = torch.from_numpy(np.array(elmo_embedding))
        return elmo_embedding


# if __name__ == '__main__':
#     sents = [['今', '天', '天气', '真', '好', '啊'],
#              ['潮水', '退', '了', '就', '知道', '谁', '没', '穿', '裤子']]
#     elmo = WordEmbeddings()
#     embs = elmo.get_tokenized_words_embeddings(sents)
#     print("OK")
def loadElmo(embfile):
    """Load a pretrained ELMoForManyLangs model.

    :param embfile: path to the pretrained ELMo model directory
    :return: the loaded Embedder
    """
    model = Embedder(embfile)
    return model
def __init__(self, emb_dim, h_dim, n_labels, v_size, gpu=True, v_vec=None,
             batch_first=True, emb_type=None, elmo_model_dir=None):
    super(BiLSTM, self).__init__()
    self.gpu = gpu
    self.h_dim = h_dim
    if self.h_dim is None:
        self.h_dim = emb_dim + 36
    if emb_type == 'ELMo':
        options_file = f'{elmo_model_dir}/options.json'
        weight_file = f'{elmo_model_dir}/weights.hdf5'
        self.word_embed = Elmo(options_file, weight_file,
                               num_output_representations=1, dropout=0)
        if gpu:
            self.word_embed = self.word_embed.cuda()
    elif emb_type == 'ELMoForManyLangs':
        from elmoformanylangs import Embedder
        e = Embedder(elmo_model_dir)
        self.word_embed = e.sents2elmo
    elif emb_type == 'None':
        self.word_embed = None
    else:
        self.word_embed = nn.Embedding(v_size, emb_dim, padding_idx=0)
        if v_vec is not None:
            v_vec = torch.tensor(v_vec)
            self.word_embed.weight.data.copy_(v_vec)
    feature_embed_layers = []
    feature_embed_size = {
        "feature:0": 25,
        "feature:1": 26,
        "feature:2": 12,
        "feature:3": 6,
        "feature:4": 94,
        "feature:5": 32
    }
    for key in feature_embed_size:
        size = feature_embed_size[key]
        feature_embed = nn.Embedding(size, 5, padding_idx=0)
        feature_embed.weight.data[0] = torch.zeros(5)
        feature_embed_layers.append(feature_embed)
    self.feature_embed_layers = nn.ModuleList(feature_embed_layers)
    self.drop_target = nn.Dropout(p=0.2)
    self.lstm = nn.LSTM(input_size=emb_dim + 36, hidden_size=self.h_dim,
                        batch_first=batch_first, bidirectional=True)
    self.l1 = nn.Linear(self.h_dim * 2, n_labels)
class ElmoModel(BaseModel):
    def __init__(self, path_to_model):
        self._embedder = Embedder(path_to_model)

    def process(self, sentences):
        # One mean-pooled 1024-d vector per sentence.
        return [np.mean(embeds, axis=0)
                for embeds in self._embedder.sents2elmo(sentences)]
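# A usage sketch for ElmoModel; the model path is a placeholder and the
# sentences are illustrative pre-tokenized input.
model = ElmoModel('path/to/elmo_model')
vectors = model.process([['hello', 'world'], ['another', 'sentence']])
print(vectors[0].shape)  # (1024,): mean of the token vectors in the sentence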
def trans_elmo(dic):
    e = Embedder('pretrained/')
    sents = []
    seg = Wordseg(batch_size=8, device="cuda:0", embedding='elmo',
                  elmo_use_cuda=False, mode="TW")
    for key in dic:
        sents.append(dic[key])
    sents = seg.cut(sents)
    vector = e.sents2elmo(sents, -1)
    print(len(vector))
    # dict preserves insertion order (Python 3.7+), so vectors line up with
    # keys; keep the first token's vector for each sentence.
    i = 0
    for key in dic:
        dic[key] = vector[i][0]
        i += 1
    return dic
class Foreign_Elmo():
    def __init__(self, dir_name, embedding_source, device=None):
        self.embedding_source = embedding_source
        self.device = device
        print('loading the embedder')
        self.e = Embedder(dir_name)
        print('finish loading the embedder')

    def _get_embeddings(self, sent_lst):
        # if self.embedding_source == 'elmo_0':
        #     type = self.e.sents2elmo(sent_lst, 0)
        #     type = [torch.tensor(x).unsqueeze(1) for x in type]
        #     return type, type
        # else:
        full_type = []
        full_token = []
        for batch_ in sent_lst:
            # print(len(batch_), len(batch_[0]))
            result = torch.tensor(self.e.sents2elmo(batch_, -2))
            # print(result.shape)
            type_ = result[:, 0, :, :]
            if self.embedding_source == 'elmo_1':
                index = 1
            elif self.embedding_source == 'elmo_2':
                index = 2
            elif self.embedding_source == 'elmo_0':
                index = 0
            token_ = result[:, index, :, :]
            full_type.append(type_)
            full_token.append(token_)
        return full_token, full_type

    def get_part_elmo(self, batch_):
        result = torch.tensor(self.e.sents2elmo(batch_, -2))
        # print(result.shape)
        if self.embedding_source == 'elmo_1':
            index = 1
        elif self.embedding_source == 'elmo_2':
            index = 2
        elif self.embedding_source == 'elmo_0':
            index = 0
        token_ = result[:, index, :, :]
        return token_
def generate_batch_data(inputfile, batch_size, args):
    elmo = Embedder(args.weights)
    if args.mat0:
        # Load the three cross-lingual mapping matrices, one per ELMo layer.
        W0, W1, W2 = {}, {}, {}
        mapmat = np.load(args.mat0)
        W0['src'] = mapmat['wx2']
        W0['trg'] = mapmat['wz2']
        W0['s'] = mapmat['s']
        mapmat = np.load(args.mat1)
        W1['src'] = mapmat['wx2']
        W1['trg'] = mapmat['wz2']
        W1['s'] = mapmat['s']
        mapmat = np.load(args.mat2)
        W2['src'] = mapmat['wx2']
        W2['trg'] = mapmat['wz2']
        W2['s'] = mapmat['s']
        mapmat = None
        xlingual = [W0, W1, W2]
    else:
        xlingual = [False] * 3
    while True:  # it needs to be infinitely iterable
        x, y = load_data(inputfile)
        print("INPUT SIZES X AND Y", len(x), len(y))
        assert len(x) == len(y)
        newxval = []
        yval = []
        for i in range(len(y)):
            newxval.append(x[i])
            yval.append(y[i])
            assert len(newxval) == len(yval)
            if i > 0 and i % batch_size == 0:
                xval0, xval1, xval2 = embed_efml(newxval, elmo, xlingual, lang=args.trlang)
                ypadded = pad_labels(yval)
                yield ([np.array(xval0), np.array(xval1), np.array(xval2)],
                       np.array(ypadded))
                newxval = []
                yval = []
        if len(newxval) > 0:
            xval0, xval1, xval2 = embed_efml(newxval, elmo, xlingual, lang=args.trlang)
            ypadded = pad_labels(yval)
            yield ([np.array(xval0), np.array(xval1), np.array(xval2)],
                   np.array(ypadded))
def __init__(self, **kwargs):
    super().__init__()
    self.elmo_embedding = Embedder(kwargs["elmo_model_path"],
                                   batch_size=kwargs["batch_size"])
    self.lstm = nn.LSTM(
        input_size=kwargs["elmo_output_dim"],
        hidden_size=kwargs["lstm_hidden_dim"],
        num_layers=kwargs["lstm_layers"],
        dropout=kwargs["dropout"] if kwargs["lstm_layers"] > 1 else 0,
        bidirectional=kwargs["bidirectional"])
    self.output = nn.Linear(in_features=2 * kwargs["lstm_hidden_dim"],
                            out_features=kwargs["output_size"])
    self.dropout = nn.Dropout(kwargs["dropout"])
    self.idx2word = kwargs["idx2word"]
    self.pad_idx = kwargs["pad_idx"]
def elmo(language):
    if language not in languages:
        raise AttributeError("Required language not in list: {}".format(
            languages.keys()))
    r = request.json
    sentences = [sent.split(' ') for sent in r['sentences']]
    if languages[language] is None:
        # Lazily load the model for this language on first request.
        languages[language] = Embedder(
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         paths[language]))
    result = languages[language].sents2elmo(sentences)
    return jsonify({"embeddings": [r.tolist() for r in result]})
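# A client-side sketch for the endpoint above, assuming it is served at
# /elmo/<language>; the host, port, and route are placeholders, not values
# from the original code.
import requests

resp = requests.post('http://localhost:5000/elmo/en',
                     json={'sentences': ['hello world', 'another sentence']})
embeddings = resp.json()['embeddings']  # one (seq_len, 1024) list per sentence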
def get_elmo_embedding(elmo_floder, token_file, sentence_vector_file):
    file_data = open(token_file, 'r', encoding="UTF-8")
    data_lines = file_data.readlines()
    batch_size = 32
    file_data.close()
    # Save one vector per sentence, one sentence per line; the original code
    # summed the two features instead: return q1_features + q2_features.
    sentences = []
    for line in data_lines:
        sentences.append(line.strip().split(" "))
    print("number of sentences: ", len(sentences))
    elmo = Embedder(elmo_floder)
    elmo_vectors = []
    for i in tqdm(range(int(len(sentences) / batch_size) + 1)):
        sentences_curr = sentences[i * batch_size:i * batch_size + batch_size]
        embedding = elmo.sents2elmo(sentences_curr, output_layer=-1)  # 1024 dimensions
        elmo_vectors += embedding
    assert len(sentences) == len(elmo_vectors), "len(data_lines) != len(elmo_vectors)"
    print("len(elmo_vectors): ", len(elmo_vectors))
    # 'wb' is binary mode, which does not take an encoding argument.
    output_file = open(sentence_vector_file, 'wb')
    pk.dump(elmo_vectors, output_file)
    output_file.close()
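# A small sketch of reading the pickled vectors back, mirroring the format
# written by get_elmo_embedding above; the filename is a placeholder.
import pickle as pk

with open('sentence_vectors.pkl', 'rb') as f:
    elmo_vectors = pk.load(f)
print(len(elmo_vectors), elmo_vectors[0].shape)  # one (seq_len, 1024) array per sentence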
def classify():
    y = np.loadtxt('ELmo_20news_group_rep\\y_train_reduced.csv', delimiter=',')
    X = np.loadtxt('ELmo_20news_group_rep\\X_train_reduced.csv', delimiter=',')
    # Hand-picked seed words per category, used as class centroids.
    centroids = [['car', 'engine', 'drive', 'speed'],
                 ['religion', 'jesus', 'god', 'believe', 'heaven', 'sin'],
                 ['baseball', 'player', 'run', 'sport', 'hit', 'bat', 'rotation'],
                 ['electronics', 'conductive', 'power', 'resistor', 'circuit'],
                 ['medical', 'methodology', 'science', 'molecule', 'virus']]
    e = Embedder('..\\PreTrainedElmo_EN', batch_size=64)
    em_vecs = [np.mean(e.sents2elmo(cat_taxo)[0], axis=0) for cat_taxo in centroids]
    # X_train = np.loadtxt('custom_doc2vec_data\\X_train.csv', delimiter=',')
    # y_train = np.loadtxt('custom_doc2vec_data\\y_train.csv', delimiter=',')
    # dist = met.pairwise_distances(X=X_train, Y=list_centroids_vectors[0].reshape(1, -1), metric='cosine')
    dist = met.pairwise_distances(X=X, Y=np.vstack(em_vecs), metric='cosine')
    print(dist.shape)
    # Assign each document to its nearest centroid and compare with the labels.
    indexes = np.argmin(dist, axis=1)
    diff_list = (indexes - y).tolist()
    diff = [1 if d == 0 else 0 for d in diff_list]
    print('taxonomy-based semi supervised classification accuracy : {}'.format(
        sum(diff) / len(diff)))
    plt.plot(indexes)
    plt.figure()
    plt.plot(y)
    plt.show()
def test_with_real_embedder(self):
    model = ElmoModel(
        max_len=50,
        fasttext_model=None,
        elmo_embedder=Embedder("models/elmo/es/"),
        **self.model_args,
    )
    X = ["Esto no es agresivo", "Esto sí es agresivo"]
    y = np.array([0, 1]).reshape(-1, 1)
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    model.fit(X, y, epochs=2)