def main():
    args = _parse_args()
    tsv_path = args.tsv_path
    embedding = BertEmbeddings('bert-base-cased')
    sentences = [[]]
    with open(tsv_path, 'r') as f:
        for i, l in enumerate(f.readlines()):
            if l.strip():
                token, *_ = l.strip().split('\t')
                sentences[-1].append(token.lower())
            else:
                sentences.append([])
    # skip empty sentences (e.g. a trailing blank line in the TSV)
    f_sentences = [Sentence(' '.join(s)) for s in sentences if s]
    for s in progressbar.progressbar(f_sentences):
        embedding.embed(s)
        for t in s:
            print('\t'.join(t.embedding.numpy().astype(str)))
        print()
        s.clear_embeddings()
class BertPretrained(ModelBase):
    """
    Encapsulates pretrained Bert Embeddings (from Zalando Flair) by conforming
    to the ModelBase interface.
    """

    def __init__(self, model: Optional[BertEmbeddings] = None):
        super(BertPretrained, self).__init__()
        if model is not None:
            self.model = model
        else:
            self.model = BertEmbeddings('bert-base-uncased')

    def dim(self) -> int:
        """
        The dimensionality of created embeddings.

        :return: 3072 (for now, #TODO)
        """
        return 3072

    def get_word_vector(self, word: str) -> Optional[np.ndarray]:
        """
        Returns the word vector for word |word| or None. It is discouraged to
        use this method as it invalidates the purpose of Bert embeddings.
        Instead, utilize the context as well for more accurate vectorization.
        In reality, Bert embeddings never return None, even for bogus words.

        :param word: The word to vectorize.
        :return: Either the word vector or None.
        """
        dummy_sentence = Sentence(word)
        self.model.embed(dummy_sentence)
        return np.array(list(dummy_sentence)[0].embedding)

    def get_word_vectors(self, words: List[str]) -> List[np.ndarray]:
        """
        Vectorizes the list of words, using pretrained Bert embeddings. These
        embeddings are context dependent, so this method is preferred over
        fetching word vectors for single words.

        :param words: The list of words to vectorize.
        :return: A list of word vectors.
        """
        sentence = Sentence(' '.join(words))
        self.model.embed(sentence)
        return list(
            map(lambda token: np.array(token.embedding), list(sentence))
        )

    def vectorize_context(self, words: List[str]) -> Optional[np.ndarray]:
        """
        Transforms the context into a single vector. May return None in
        extreme cases, e.g. if |words| is an empty list.

        :param words: List of tokens describing the context.
        :return: A single word vector or None.
        """
        return self.mean_of_words(self.get_word_vectors(words))
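# Hedged usage sketch for BertPretrained above (not part of the original
# source): it only illustrates the intended call pattern. ModelBase and its
# mean_of_words helper are assumed to exist elsewhere in the project.
if __name__ == '__main__':
    model = BertPretrained()                           # defaults to 'bert-base-uncased'
    context = ['the', 'grass', 'is', 'green']
    vectors = model.get_word_vectors(context)          # one vector of size model.dim() per token
    print(len(vectors), vectors[0].shape, model.dim())
    context_vector = model.vectorize_context(context)  # mean over the token vectors
    print(context_vector.shape)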
class SentenceBertEmbedderSensor(SentenceSensor):
    def __init__(self, *pres):
        super().__init__(*pres)
        self.bert_embedding = BertEmbeddings()

    def forward(
        self,
    ) -> Any:
        self.bert_embedding.embed(self.fetch_value(self.sentence_value))
        return None
def get_flair_bert_embeddings(words):
    # Experimental -- not tested
    from flair.embeddings import BertEmbeddings
    bert_embedding = BertEmbeddings('bert-base-multilingual-cased')
    sentence = Sentence(words)
    bert_embedding.embed(sentence)
    return sentence
def dump_bert_vecs(df, dump_dir):
    print("Getting BERT vectors...")
    embedding = BertEmbeddings('bert-base-uncased')
    word_counter = defaultdict(int)
    stop_words = set(stopwords.words('english'))
    stop_words.add("would")
    except_counter = 0
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    for index, row in df.iterrows():
        if index % 100 == 0:
            print("Finished sentences: " + str(index) + " out of " + str(len(df)))
        # all sentences are lowercased here
        line = row["sentence"].lower()
        sentences = sent_tokenize(line)
        for sentence_ind, sent in enumerate(sentences):
            tokenized_text = tokenizer.tokenize(sent)
            if len(tokenized_text) > 512:
                print('sentence too long for Bert: truncating')
                # truncate to the first 512 word pieces (the original sliced the
                # raw string, which cuts characters rather than tokens)
                sentence = Sentence(' '.join(tokenized_text[:512]), use_tokenizer=True)
            else:
                sentence = Sentence(sent, use_tokenizer=True)
            try:
                embedding.embed(sentence)
            except Exception as e:
                except_counter += 1
                print("Exception Counter while getting BERT: ",
                      except_counter, sentence_ind, index, e)
                print(sentence)
                continue
            for token_ind, token in enumerate(sentence):
                word = token.text
                word = word.translate(
                    str.maketrans('', '', string.punctuation))
                if word in stop_words or "/" in word or len(word) == 0:
                    continue
                word_dump_dir = dump_dir + word
                os.makedirs(word_dump_dir, exist_ok=True)
                fname = word_dump_dir + "/" + str(word_counter[word]) + ".pkl"
                word_counter[word] += 1
                vec = token.embedding.cpu().numpy()
                try:
                    with open(fname, "wb") as handler:
                        pickle.dump(vec, handler)
                except Exception as e:
                    except_counter += 1
                    print("Exception Counter while dumping BERT: ",
                          except_counter, sentence_ind, index, word, e)
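# Hedged usage sketch for dump_bert_vecs above (not part of the original
# script): df is assumed to be a pandas DataFrame with a "sentence" column,
# and one pickled vector per word occurrence ends up under
# dump_dir/<word>/<i>.pkl. The example data and path are illustrative only.
import pandas as pd

toy_df = pd.DataFrame({"sentence": ["The grass is green.", "I love Berlin."]})
dump_bert_vecs(toy_df, dump_dir="./bert_vecs/")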
class BertEmbedding(EmbeddingBase):
    def __init__(self):
        self.model = BertEmbeddings(
            bert_model_or_path="bert-base-multilingual-cased")
        self.size = 3072

    def _get_vector(self, sentence: Sentence) -> np.ndarray:
        res = np.zeros(self.size, dtype=np.float32)
        for token in sentence.tokens:
            vec = np.fromiter(token.embedding.tolist(), dtype=np.float32)
            vec = vec / np.linalg.norm(vec, ord=2)
            res += vec
        res /= len(sentence.tokens)
        return res

    def batcher(self, params, batch: List[List[str]]) -> np.ndarray:
        # empty inputs fall back to a single '.' sentence (the original used a
        # bare ['.'] list here, which is not a Sentence and would break _get_vector)
        batch = [
            Sentence(" ".join(sent)) if sent != [] else Sentence('.')
            for sent in batch
        ]
        embeddings = []
        sentences = self.model.embed(batch)
        for sent in sentences:
            embeddings.append(self._get_vector(sent))
        embeddings = np.vstack(embeddings)
        return embeddings

    def dim(self) -> int:
        return self.size
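# Hedged usage sketch for the SentEval-style BertEmbedding above: the batcher
# takes a list of token lists (params is unused here) and returns one row of
# mean-pooled, L2-normalised token vectors per sentence. The inputs are made
# up for illustration.
if __name__ == '__main__':
    encoder = BertEmbedding()
    batch = [['The', 'grass', 'is', 'green', '.'],
             ['I', 'love', 'Berlin', '.'],
             []]                                    # empty input falls back to a '.' sentence
    mat = encoder.batcher(params=None, batch=batch)
    print(mat.shape)                                # (3, encoder.dim()) == (3, 3072)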
class Bert(nn.Module):
    def __init__(self, idx2word, device=torch.device('cpu')):
        super(Bert, self).__init__()
        self.idx2word = idx2word
        self.embed_size = sizes["bert"]
        self.bert = BertEmbeddings('bert-base-uncased', '-2')

    def proc(self, string):
        if string == '.':
            return "[SEP]"
        if string == "__":
            return "[MASK]"
        return string

    def forward(self, batch):
        # TODO: fill this in
        batch_as_words = [[
            self.proc(str(self.idx2word[token])) for token in l
        ] for l in batch.transpose(0, 1).tolist()]
        batch_as_sentences = [Sentence(' '.join(l)) for l in batch_as_words]
        embeds = self.bert.embed(batch_as_sentences)
        embeds = [[token.embedding for token in sentence]
                  for sentence in embeds]
        return torch.stack([torch.stack(sentence)
                            for sentence in embeds]).transpose(0, 1).cuda()
def get_Bert_embeddings(vocab, dim):
    from flair.embeddings import BertEmbeddings
    from flair.data import Sentence
    _embeddings = np.zeros([len(vocab), dim])
    temp = []
    for each_word in vocab:
        temp.append(each_word)
    sentence = Sentence(' '.join(temp))
    embedding = BertEmbeddings()
    embedding.embed(sentence)
    for token in sentence:
        _embeddings[vocab[token.text]] = token.embedding
    return _embeddings
def get_Bert_embeddings(vocab, dim):
    from flair.embeddings import BertEmbeddings
    from flair.data import Sentence
    _embeddings = np.zeros([len(vocab), dim])
    temp = []
    for each_word in vocab:
        temp.append(each_word)
    sentence = Sentence(' '.join(temp))
    embedding = BertEmbeddings()
    embedding.embed(sentence)
    for token in sentence:
        try:
            _embeddings[vocab[token.text]] = token.embedding
        except KeyError:
            log.warning(f'Bad token {token.text} for Bert embedding')
    return _embeddings
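# Hedged usage sketch for get_Bert_embeddings above: vocab is assumed to be a
# word -> row-index dict, and dim must match the flair output size (3072 for
# the default four concatenated layers of a BERT-base model). Note that
# joining the whole vocabulary into a single Sentence only works while it
# stays within BERT's 512-subword limit.
toy_vocab = {'grass': 0, 'green': 1, 'sky': 2}
emb_matrix = get_Bert_embeddings(toy_vocab, dim=3072)
print(emb_matrix.shape)  # (3, 3072)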
class BertEmbedder:
    """Embed Bert Embeddings"""

    def __init__(self, len, emb='en'):
        """
        Args:
            len (int): max length for the model input
            emb (str, optional): embedding language. Defaults to 'en'.
        """
        if emb == 'en':
            self.embedder = BertEmbeddings("distilbert-base-uncased")
        self.MAX_LEN = len

    def embed_sentence(self, sentence):
        """Embeds a sentence with the BERT embedder.

        Args:
            sentence (str): raw sentence

        Returns:
            np.array: embedded matrix
        """
        flair_sentence = Sentence(sentence)
        while len(flair_sentence) < self.MAX_LEN:
            flair_sentence.add_token(Token("__PAD__"))
        self.embedder.embed(flair_sentence)
        return np.stack([t.embedding.cpu().numpy() for t in flair_sentence])
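# Hedged usage sketch for BertEmbedder above: shows the padding behaviour.
# The max length of 16 is an arbitrary illustrative value; flair's Token must
# be imported alongside Sentence for the padding to work.
embedder = BertEmbedder(len=16, emb='en')
matrix = embedder.embed_sentence('The grass is green .')
print(matrix.shape)  # (16, embedding_dim): short sentences are padded with __PAD__ tokens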
def contextualize(df, cluster_dump_dir):
    def get_cluster(tok_vec, cc):
        max_sim = -10
        max_sim_id = -1
        for i, cluster_center in enumerate(cc):
            sim = cosine_similarity(tok_vec, cluster_center)
            if sim > max_sim:
                max_sim = sim
                max_sim_id = i
        return max_sim_id

    print("Contextualizing the corpus..")
    embedding = BertEmbeddings('bert-base-uncased')
    stop_words = set(stopwords.words('english'))
    stop_words.add('would')
    except_counter = 0
    word_cluster = {}
    for index, row in df.iterrows():
        if index % 100 == 0:
            print("Finished rows: " + str(index) + " out of " + str(len(df)))
        line = row["sentence"]
        sentences = sent_tokenize(line)
        for sentence_ind, sent in enumerate(sentences):
            sentence = Sentence(sent, use_tokenizer=True)
            embedding.embed(sentence)
            for token_ind, token in enumerate(sentence):
                word = token.text
                if word in stop_words:
                    continue
                word_clean = word.translate(
                    str.maketrans('', '', string.punctuation))
                if len(word_clean) == 0 or word_clean in stop_words or "/" in word_clean:
                    continue
                try:
                    cc = word_cluster[word_clean]
                except:
                    try:
                        cc = word_cluster[word]
                    except:
                        word_clean_path = cluster_dump_dir + word_clean + "/cc.pkl"
                        word_path = cluster_dump_dir + word + "/cc.pkl"
                        try:
                            with open(word_clean_path, "rb") as handler:
                                cc = pickle.load(handler)
                            word_cluster[word_clean] = cc
                        except:
                            try:
                                with open(word_path, "rb") as handler:
                                    cc = pickle.load(handler)
                                word_cluster[word] = cc
                            except Exception as e:
                                except_counter += 1
                                print("Exception Counter while getting clusters: ",
                                      except_counter, index, e)
                                continue
                if len(cc) > 1:
                    tok_vec = token.embedding.cpu().numpy()
                    cluster = get_cluster(tok_vec, cc)
                    sentence.tokens[token_ind].text = word + "$" + str(cluster)
            sentences[sentence_ind] = to_tokenized_string(sentence)
        df["sentence"][index] = " . ".join(sentences)
    return df, word_cluster
def get_ent_emb_dict(self, df_ent_final_ranking, only_top_N_entitis=10,
                     dump_flair_res_to_pickle=False):
    print("In function: get_ent_emb_dict")
    # To get rid of this error: OMP: Error #15: Initializing libomp.dylib, but
    # found libiomp5.dylib already initialized.
    os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
    bert_embedding = BertEmbeddings('bert-base-cased')  # do_lower_case=False
    if dump_flair_res_to_pickle:
        PIK = self.base_dir + "flair_res_embeddings.pkl"
        print("saving pickle object at: ", PIK)
        f = open(PIK, "wb")
        pickle.dump(len(self.df_extractions), f)
    entities = df_ent_final_ranking["entity"][:only_top_N_entitis]
    ent_emb_lists = {}
    ent_has_enough_embs = {}
    for ind, ent in enumerate(entities):
        ent_emb_lists[ent] = {"count": 0,
                              "type": df_ent_final_ranking.iloc[ind]["type"],
                              "embeddings": []}
        ent_has_enough_embs[ent] = False
    cnt_found_entities = 0
    for ind_row, row in self.df_extractions.iterrows():
        if ind_row % 500 == 0:
            print(ind_row)
        # if ind_row > 1000:
        #     break
        has_entity = False
        for ent in entities:
            if ent_has_enough_embs[ent]:
                continue
            if ent in row["sentence"].lower():
                has_entity = True
                break
        if not has_entity:
            continue
        sent_words = [t.text.lower() for t in row["flair_res"]]
        row_with_embeddings = row["flair_res"]
        bert_embedding.embed(row_with_embeddings)
        if dump_flair_res_to_pickle:
            pickle.dump(row_with_embeddings, f)
        # sent = self._get_sentence_space_delimited(row)
        # sent = sent.lower()
        # sent_words = sent.split(" ")
        '''
        Algo: for every word (w) in the sentence, find the embedding for
        entities that start at w.
        '''
        for ind_w, w in enumerate(sent_words):
            for ent in entities:
                if ent_has_enough_embs[ent]:
                    continue
                ent_words = ent.split(" ")
                ent_embs = []
                ent_words_len = len(ent_words)
                cnt = 0
                while cnt < ent_words_len:
                    if ind_w + cnt >= len(sent_words) or sent_words[ind_w + cnt] != ent_words[cnt]:
                        ent_embs = []
                        break
                    else:
                        # flair's get_token is 1-based, so the token matching
                        # sent_words[ind_w + cnt] sits at index ind_w + cnt + 1
                        # (the original always fetched ind_w + 1, which repeats
                        # the first token for multi-word entities)
                        ent_embs.append(np.array(
                            row_with_embeddings.get_token(ind_w + cnt + 1).embedding))
                        cnt += 1
                if len(ent_embs) > 0:
                    ent_emb_lists[ent]["embeddings"].append(np.mean(ent_embs, axis=0))
                    ent_emb_lists[ent]["count"] += 1
                    # let's only take the average of some mentions (for speed-up
                    # purposes) -- remove the following if condition to average
                    # over all the entity mentions
                    if ent_emb_lists[ent]["count"] > 0:
                        ent_has_enough_embs[ent] = True
                        print(ent, " --- Embedding found.")
                        cnt_found_entities += 1
    print("Number of found entities: ", cnt_found_entities)
    return ent_emb_lists
def test(dir_model, feature='LSTM'):
    if feature == 'BERT':
        model = BERT_CRF(tag_to_ix=tag_to_ix)
        checkpoint = torch.load(dir_model)
        model.load_state_dict(checkpoint)
        model = model.to(device)
        # load the pretrained BERT embeddings
        embedding = BertEmbeddings('bert-base-chinese', '-1', 'mean')
        while True:
            print('Enter text ("quit" to finish):\n')
            text = input()
            if text != 'quit':
                with torch.no_grad():
                    # convert the text to a tensor
                    x_test = Sentence(' '.join(text.replace(' ', '|')))
                    embedding.embed(x_test)
                    x_test = torch.cat(
                        [token.embedding.unsqueeze(0) for token in x_test],
                        dim=0).unsqueeze(0).to(device)
                    # run the model to get the tag sequence
                    test_tag = model(x_test)[0]
                    tag = [ix_to_tag[ix] for ix in test_tag]
                    # print(tag)
                    result = re.finditer("S|BM*E", ''.join(tag))  # locate entities, i.e. "words"
                    result = [[m.start(), m.end()] for m in result]
                    text_cut = ''
                    for i in result:
                        text_cut += ('/' + text[i[0]:i[1]])
                    print('\nSegmentation result:\n', text_cut, '\n')
            else:
                break
    else:
        # load the trained model
        model = BiLSTM_CRF(vocab_size=num_words + 2,
                           tag_to_ix=tag_to_ix,
                           embedding_dim=EMBEDDING_DIM,
                           hidden_dim=HIDDEN_DIM)
        checkpoint = torch.load(dir_model)
        model.load_state_dict(checkpoint)
        model = model.to(device)
        while True:
            print('Enter text ("quit" to finish):\n')
            text = input()
            if text != 'quit':
                with torch.no_grad():
                    # convert the text to index sequences
                    x_test = [word_index.get(char, num_words) for char in text]
                    x_test = torch.LongTensor([x_test]).to(device)
                    # run the model to get the tag sequence
                    test_tag = model(x_test)[0]
                    tag = [ix_to_tag[ix] for ix in test_tag]
                    result = re.finditer("S|BM*E", ''.join(tag))  # locate entities, i.e. "words"
                    result = [[m.start(), m.end()] for m in result]
                    text_cut = ''
                    for i in result:
                        text_cut += ('/' + text[i[0]:i[1]])
                    print('\nSegmentation result:\n', text_cut, '\n')
            else:
                break
def contextualizeSentences(strings, word_cluster):
    def cosine_similarity(a, b):
        return 1 - spatial.distance.cosine(a, b)

    def to_tokenized_string(sentence):
        tokenized = " ".join([t.text for t in sentence.tokens])
        return tokenized

    def get_cluster(tok_vec, cc):
        max_sim = -10
        max_sim_id = -1
        for i, cluster_center in enumerate(cc):
            sim = cosine_similarity(tok_vec, cluster_center)
            if sim > max_sim:
                max_sim = sim
                max_sim_id = i
        return max_sim_id

    out = []
    embedding = BertEmbeddings('bert-base-uncased')
    # this tokenizer is used to check for length > 512
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    # the loop variable is named string_ so it does not shadow the string module
    for index, string_ in enumerate(strings):
        print("Contextualizing the corpus ", index)
        stop_words = set(stopwords.words('english'))
        stop_words.add('would')
        sentences = sent_tokenize(string_)
        for sentence_ind, sent in enumerate(sentences):
            tokenized_text = tokenizer.tokenize(sent)
            if len(tokenized_text) > 512:
                print('sentence too long for Bert: truncating')
                # truncate to the first 512 word pieces (slicing the raw string
                # would cut characters, not tokens)
                sentence = Sentence(' '.join(tokenized_text[:512]), use_tokenizer=True)
            else:
                sentence = Sentence(sent, use_tokenizer=True)
            try:
                embedding.embed(sentence)
            except:
                print(index)
                print(sentence)
            for token_ind, token in enumerate(sentence):
                word = token.text
                if word in stop_words:
                    continue
                word_clean = word.translate(str.maketrans('', '', string.punctuation))
                if len(word_clean) == 0 or word_clean in stop_words or "/" in word_clean:
                    continue
                try:
                    cc = word_cluster[word_clean]
                except Exception as e:
                    print("Exception Counter while getting clusters: ", index, e)
                    continue
                # try:
                #     cc = word_cluster[word]
                # except:
                #     word_clean_path = cluster_dump_dir + word_clean + "/cc.pkl"
                #     word_path = cluster_dump_dir + word + "/cc.pkl"
                #     try:
                #         with open(word_clean_path, "rb") as handler:
                #             cc = pickle.load(handler)
                #         word_cluster[word_clean] = cc
                #     except:
                #         try:
                #             with open(word_path, "rb") as handler:
                #                 cc = pickle.load(handler)
                #             word_cluster[word] = cc
                #         except Exception as e:
                if len(cc) > 1:
                    tok_vec = token.embedding.cpu().numpy()
                    cluster = get_cluster(tok_vec, cc)
                    sentence.tokens[token_ind].text = word + "$" + str(cluster)
            sentences[sentence_ind] = to_tokenized_string(sentence)
        out.append(" . ".join(sentences))
    return out
if args.layers == 'mean':
    embedding = BertEmbeddings(args.model_name,
                               layers='-1,-2,-3,-4',
                               use_scalar_mix=True,
                               pooling_operation="mean")
else:
    embedding = BertEmbeddings(args.model_name,
                               layers=args.layers,
                               pooling_operation="mean")

if 'pubmed' in args.model_name.lower():
    embedding.tokenizer.basic_tokenizer.do_lower_case = False

flag = args.dataset
dataset = []
with open(f'./datasets/unified/train.{flag}.json') as f:
    dataset += json.load(f)
with open(f'./datasets/unified/valid.{flag}.json') as f:
    dataset += json.load(f)
with open(f'./datasets/unified/test.{flag}.json') as f:
    dataset += json.load(f)

bert_emb_dict = {}
for item in tqdm(dataset):
    tokens = tuple(item['tokens'])
    s = form_sentence(tokens)
    embedding.embed(s)
    emb = get_embs(s)
    bert_emb_dict[tokens] = emb.astype('float16')

with open(args.lm_emb_save_path, 'wb') as f:
    pickle.dump(bert_emb_dict, f)
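# form_sentence and get_embs are used above but defined elsewhere in the
# repository. A minimal sketch of what they might look like, assuming one
# flair Token per pre-tokenised input token and per-token numpy embeddings:
from flair.data import Sentence, Token
import numpy as np

def form_sentence(tokens):
    # build a flair Sentence without re-tokenising: one Token per input token
    s = Sentence()
    for t in tokens:
        s.add_token(Token(t))
    return s

def get_embs(sentence):
    # stack the per-token embeddings into a (num_tokens, emb_dim) array
    return np.stack([t.embedding.cpu().numpy() for t in sentence])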
from flair.embeddings import BertEmbeddings
from flair.data import Sentence

# init embedding
embedding = BertEmbeddings(layers='-10')

# create a sentence
sentence = Sentence('The grass is green .')

# embed words in sentence
print(embedding.embed(sentence))

for token in sentence:
    print(token)
    print(token.embedding)
    print(token.embedding.shape)
for token in sentence:
    print(token)
    print(token.embedding)

# load and embed with Flair embeddings
flair_embedding_forward = FlairEmbeddings('model/news-forward-0.4.1.pt')
sentence = Sentence('The grass is green .')
flair_embedding_forward.embed(sentence)
for token in sentence:
    print(token)
    print(token.embedding)

# load and embed with BERT embeddings
embedding = BertEmbeddings()
sentence = Sentence('The grass is green .')
embedding.embed(sentence)
for token in sentence:
    print(token)
    print(token.embedding)

# load and embed with ELMo embeddings
embedding = ELMoEmbeddings()
sentence = Sentence('The grass is green .')
embedding.embed(sentence)
for token in sentence:
    print(token)
    print(token.embedding)

# load and embed with stacked embeddings
stacked_embeddings = StackedEmbeddings([WordEmbeddings('model/glove.gensim'),
                                        FlairEmbeddings('model/news-forward-0.4.1.pt')])
sentence = Sentence('The grass is green .')
len(fn.frames())

txt = preprocess.read_pg(data_root + r'\EN_1818_Shelley,Mary_Frankenstein_Novel.txt')
print(len(txt), 'chars')

from segtok.segmenter import split_single
sentences = [Sentence(s, use_tokenizer=True) for s in split_single(txt)]
print(len(sentences), 'sentences')

import random as rand
t = range(100)  # rand.sample(range(len(sentences)), 100)
sents_sample = [sentences[i] for i in sorted(t)]
t = np.array(t)
_ = bert_embedding.embed(sents_sample)

from scipy.spatial.distance import cosine
from torch.nn.functional import cosine_similarity
from itertools import product

def cosines(tokens):
    s = np.zeros([n, n])
    for (i, j), _ in np.ndenumerate(s):
        s[i, j] = cosine(tokens[i], tokens[j])
    return s

def cosines(vecs, return_type=np.zeros):
    # NOTE: this second definition shadows the one above
    vecs = list(vecs)
    n = len(vecs)
    c = return_type([n, n])
def test(method='RNN'):
    if method not in ['RNN', 'BERT', 'BERT_RNN']:
        raise ValueError("method should be 'RNN', 'BERT' or 'BERT_RNN'")
    with open(dir_tokenizer, 'rb') as f:
        tokenizer = pickle.load(f)
    e_index = tokenizer.word_index['e']
    if method == 'RNN':
        net = NET_RNN().to(device)
        checkpoint = torch.load(MODEL_PATH_RNN)
        net.load_state_dict(checkpoint)
    else:
        if method == 'BERT':
            net = NET_BERT().to(device)
            checkpoint = torch.load(MODEL_PATH_BERT)
        else:
            net = NET_BERT_RNN().to(device)
            checkpoint = torch.load(MODEL_PATH_BERT_RNN)
        net.load_state_dict(checkpoint)
        embedding = BertEmbeddings(bert_model_or_path=EMBEDDING,
                                   pooling_operation=POOLING,
                                   layers=BERT_LAYERS)
    while True:
        print('\nEnter text to seed the poem. Leave it empty for a random start, or type "quit" to exit!\n')
        text = input('Input: ')
        if text == 'quit':
            break
        elif text == '':
            text = np.random.choice(list(tokenizer.index_word.values()))
        if method == 'RNN':
            while True:
                x_seq_batch = tokenizer.texts_to_sequences(texts=[text])
                x_seq_batch = torch.LongTensor(x_seq_batch).to(device)
                with torch.no_grad():
                    outputs = net(x_seq_batch)
                predicted = nn.Softmax(dim=0)(outputs.data.cpu()[-1])
                predicted = np.random.choice(np.arange(len(predicted)),
                                             p=predicted.numpy())
                if predicted not in [0, e_index]:
                    text += tokenizer.index_word[predicted]
                else:
                    break
                if len(text) >= opt.maxlen:
                    break
        else:
            while True:
                text_p = ' '.join(text)
                sentence = Sentence(text_p)
                embedding.embed(sentence)
                x_seq_batch = torch.Tensor(
                    [[token.embedding.numpy() for token in sentence]])
                x_seq_batch = torch.Tensor(x_seq_batch).to(device)
                with torch.no_grad():
                    outputs = net(x_seq_batch)
                predicted = nn.Softmax(dim=0)(outputs.data.cpu()[-1])
                predicted = np.random.choice(np.arange(len(predicted)),
                                             p=predicted.numpy())
                if predicted not in [0, e_index]:
                    text += tokenizer.index_word[predicted]
                else:
                    break
                if len(text) >= opt.maxlen:
                    break
        text_list = re.findall(pattern='[^。?!]*[。?!]', string=text)
        print('Composition finished:\n')
        for i in text_list:
            print(i)
from flair.data import Sentence
from flair.embeddings import BertEmbeddings

# instantiate BERT embeddings
bert_embeddings = BertEmbeddings()

# make example sentence
sentence = Sentence('I love Berlin.', use_tokenizer=True)

# embed sentence
bert_embeddings.embed(sentence)

# print embedded tokens
for token in sentence:
    print(token)
    print(token.embedding)
bert_embedding = BertEmbeddings(args.bert_name,
                                layers='-1,-2,-3,-4',
                                use_scalar_mix=True,
                                pooling_operation="mean")

flag = args.dataset
dataset = []
with open(f'./datasets/unified/train.{flag}.json') as f:
    dataset += json.load(f)
with open(f'./datasets/unified/valid.{flag}.json') as f:
    dataset += json.load(f)
with open(f'./datasets/unified/test.{flag}.json') as f:
    dataset += json.load(f)

bert_emb_dict = {}
for item in tqdm(dataset):
    tokens = tuple(item['tokens'])
    s = form_sentence(tokens)

    s.clear_embeddings()
    bert_embedding.embed(s)
    emb = get_embs(s)  # (T, 4*H)

    # albert_embedding is assumed to be defined earlier in the script
    s.clear_embeddings()
    albert_embedding.embed(s)
    emb = np.concatenate([emb, get_embs(s)], axis=-1)

    bert_emb_dict[tokens] = emb.astype('float16')

with open(args.lm_emb_save_path, 'wb') as f:
    pickle.dump(bert_emb_dict, f)
job_desc = re.sub(generic_re, '', job_desc)

all_sentances = []
doc = sent_nlp(job_desc)
for sent in doc.sents:
    all_sentances.append(sent.string.strip())

for sentance in all_sentances:
    if len(sentance) >= 5 and len(sentance) < 512:
        doc = Sentence(sentance, use_tokenizer=build_spacy_tokenizer(sent_nlp))
        predictions = tagger.predict(doc)
        labels_dict = predictions[0].to_dict(tag_type='ner')
        all_entities = [item['text'] for item in labels_dict['entities']]
        embeddings.embed(doc)
        for token in doc:
            if token.text in all_entities:
                tensor = token.embedding.detach().cpu().numpy()
                skill_embeddings.append((token.text, tensor))

from sklearn.cluster import KMeans
import numpy as np

embeddings_df = pd.DataFrame(skill_embeddings, columns=['skill', 'embedding'])
embeddings_df['skill'] = embeddings_df['skill'].map(lambda x: x.lower())
skill_counts = embeddings_df.groupby('skill').size()
avg_embed = embeddings_df.groupby('skill')['embedding'].apply(np.mean)
full_df = pd.concat([skill_counts, avg_embed], axis=1)
full_df.columns = ['count', 'embedding']
full_df = full_df.loc[full_df['count'] >= 5]
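# KMeans is imported above but the clustering step itself is not shown in this
# fragment. A minimal sketch of how the averaged skill embeddings might be
# clustered (the number of clusters is an arbitrary illustrative choice):
X = np.vstack(full_df['embedding'].values)
kmeans = KMeans(n_clusters=10, random_state=0).fit(X)
full_df['cluster'] = kmeans.labels_
print(full_df.groupby('cluster').size())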