Example no. 1
def main():
    args = _parse_args()
    tsv_path = args.tsv_path

    embedding = BertEmbeddings('bert-base-cased')

    sentences = [[]]
    with open(tsv_path, 'r') as f:
        for i, l in enumerate(f.readlines()):
            if l.strip():
                token, *_ = l.strip().split('\t')
                sentences[-1].append(token.lower())
            else:
                sentences.append([])

    # skip empty sentence groups (e.g. from a trailing blank line)
    f_sentences = [Sentence(' '.join(s)) for s in sentences if s]

    for s in progressbar.progressbar(f_sentences):
        embedding.embed(s)

        for t in s:
            print('\t'.join(t.embedding.numpy().astype(str)))
        print()

        s.clear_embeddings()
Example no. 2
class BertPretrained(ModelBase):
    """
    Encapsulates pretrained Bert Embeddings (from Zalando Flair) by conforming to the ModelBase interface.
    """

    def __init__(self, model: Optional[BertEmbeddings] = None):
        super(BertPretrained, self).__init__()

        if model is not None:
            self.model = model
        else:
            self.model = BertEmbeddings('bert-base-uncased')

    def dim(self) -> int:
        """
        The dimensionality of created embeddings.

        :return: 3072 (for now, #TODO)
        """
        return 3072

    def get_word_vector(self, word: str) -> Optional[np.ndarray]:
        """
        Returns the word vector for word |word| or None. Using this method is discouraged, as it defeats the
        purpose of contextual Bert embeddings; prefer passing the full context for a more accurate vectorization.

        In reality, Bert embeddings never return None, even for bogus words.

        :param word: The word to vectorize.
        :return: Either the word vector or None.
        """
        dummy_sentence = Sentence(word)
        self.model.embed(dummy_sentence)
        return np.array(list(dummy_sentence)[0].embedding)

    def get_word_vectors(self, words: List[str]) -> List[np.ndarray]:
        """
        Vectorizes the list of words, using pretrained Bert embeddings. These embeddings are context dependent, so this
        method is preferred over fetching word vectors for single words.

        :param words: The list of words to vectorize.
        :return: A list of word vectors.
        """
        sentence = Sentence(' '.join(words))
        self.model.embed(sentence)
        return [np.array(token.embedding) for token in sentence]

    def vectorize_context(self, words: List[str]) -> Optional[np.ndarray]:
        """
        Transforms the context into a single vector. May return None in extreme cases, e.g. if |words| is an empty list.

        :param words: List of tokens describing the context.
        :return: A single word vector or None.
        """
        return self.mean_of_words(self.get_word_vectors(words))
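A minimal usage sketch for the class above (not part of the original source); the call pattern follows the docstrings, and the example context plus the assumption that BertPretrained is importable are purely illustrative.

# Illustrative usage only; assumes BertPretrained is importable from the surrounding project.
model = BertPretrained()                        # falls back to 'bert-base-uncased'
context = ['the', 'bank', 'of', 'the', 'river']
vectors = model.get_word_vectors(context)       # one contextual vector per word
context_vec = model.vectorize_context(context)  # mean over the word vectors
print(len(vectors), context_vec.shape)          # 5 (3072,)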
Example no. 3
class SentenceBertEmbedderSensor(SentenceSensor):
    def __init__(self, *pres):
        super().__init__(*pres)
        self.bert_embedding = BertEmbeddings()
 
    def forward(
        self,
    ) -> Any:
        self.bert_embedding.embed(self.fetch_value(self.sentence_value))
        return None
Example no. 4
def get_flair_bert_embeddings(words):

    # Experimental -- not tested

    from flair.embeddings import BertEmbeddings

    bert_embedding = BertEmbeddings('bert-base-multilingual-cased')

    sentence = Sentence(words)
    bert_embedding.embed(sentence)

    return sentence
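A possible call site for the experimental helper above (illustrative only; the function relies on flair's Sentence already being imported at module level, and the sample text is an assumption):

# Illustrative call; the returned object is a flair Sentence with embedded tokens.
embedded = get_flair_bert_embeddings('Das ist ein kleiner Test')
for token in embedded:
    print(token.text, token.embedding.shape)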
Example no. 5
    def dump_bert_vecs(df, dump_dir):
        print("Getting BERT vectors...")
        embedding = BertEmbeddings('bert-base-uncased')
        word_counter = defaultdict(int)
        stop_words = set(stopwords.words('english'))
        stop_words.add("would")
        except_counter = 0

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        for index, row in df.iterrows():
            if index % 100 == 0:
                print("Finished sentences: " + str(index) + " out of " +
                      str(len(df)))
            # all sentences are lowercased here
            line = row["sentence"].lower()
            sentences = sent_tokenize(line)
            for sentence_ind, sent in enumerate(sentences):
                tokenized_text = tokenizer.tokenize(sent)
                if len(tokenized_text) > 512:
                    print('sentence too long for BERT: truncating to 512 tokens')
                    sentence = Sentence(' '.join(tokenized_text[:512]),
                                        use_tokenizer=True)
                else:
                    sentence = Sentence(sent, use_tokenizer=True)
                try:
                    embedding.embed(sentence)
                except Exception as e:
                    except_counter += 1
                    print("Exception Counter while getting BERT: ",
                          except_counter, sentence_ind, index, e)
                    print(sentence)
                    continue
                for token_ind, token in enumerate(sentence):
                    word = token.text
                    word = word.translate(
                        str.maketrans('', '', string.punctuation))
                    if word in stop_words or "/" in word or len(word) == 0:
                        continue
                    word_dump_dir = dump_dir + word
                    os.makedirs(word_dump_dir, exist_ok=True)
                    fname = word_dump_dir + "/" + str(
                        word_counter[word]) + ".pkl"
                    word_counter[word] += 1
                    vec = token.embedding.cpu().numpy()
                    try:
                        with open(fname, "wb") as handler:
                            pickle.dump(vec, handler)
                    except Exception as e:
                        except_counter += 1
                        print("Exception Counter while dumping BERT: ",
                              except_counter, sentence_ind, index, word, e)
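The per-word vectors written above can be read back with pickle; a minimal sketch, where the word directory and file index are hypothetical and dump_dir is the same path passed to dump_bert_vecs:

# Illustrative read-back of a single dumped vector; the path is hypothetical.
import pickle

with open(dump_dir + "bank/0.pkl", "rb") as handler:
    vec = pickle.load(handler)
print(vec.shape)  # (3072,) with flair's default BertEmbeddings layers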
Example no. 6
class BertEmbedding(EmbeddingBase):
    def __init__(self):
        self.model = BertEmbeddings(
            bert_model_or_path="bert-base-multilingual-cased")
        self.size = 3072

    def _get_vector(self, sentence: Sentence) -> np.ndarray:
        res = np.zeros(self.size, dtype=np.float32)
        for token in sentence.tokens:
            vec = np.fromiter(token.embedding.tolist(), dtype=np.float32)
            vec = vec / np.linalg.norm(vec, ord=2)
            res += vec
        res /= len(sentence.tokens)
        return res

    def batcher(self, params, batch: List[List[str]]) -> np.ndarray:
        # an empty token list still needs a valid (non-empty) Sentence
        batch = [
            Sentence(" ".join(sent)) if sent else Sentence('.') for sent in batch
        ]
        embeddings = []
        sentences = self.model.embed(batch)
        for sent in sentences:
            embeddings.append(self._get_vector(sent))
        embeddings = np.vstack(embeddings)
        return embeddings

    def dim(self) -> int:
        return self.size
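A sketch of how the batcher above might be driven; it appears to follow a SentEval-style interface, and since params is unused by the implementation, None is passed (illustrative only):

# Illustrative usage; assumes the BertEmbedding class above is in scope.
emb = BertEmbedding()
batch = [["the", "grass", "is", "green"], ["hello", "world"]]
vectors = emb.batcher(params=None, batch=batch)
print(vectors.shape)  # (2, 3072): one mean-of-normalized-token vector per sentence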
Example no. 7
class Bert(nn.Module):
    def __init__(self, idx2word, device=torch.device('cpu')):
        super(Bert, self).__init__()
        self.idx2word = idx2word
        self.device = device
        self.embed_size = sizes["bert"]
        self.bert = BertEmbeddings('bert-base-uncased', '-2')

    def proc(self, string):
        if string == '.':
            return "[SEP]"

        if string == "__":
            return "[MASK]"

        return string

    def forward(self, batch):
        # TODO: fill this in
        batch_as_words = [[
            self.proc(str(self.idx2word[token])) for token in l
        ] for l in batch.transpose(0, 1).tolist()]
        batch_as_sentences = [Sentence(' '.join(l)) for l in batch_as_words]
        embeds = self.bert.embed(batch_as_sentences)
        embeds = [[token.embedding for token in sentence]
                  for sentence in embeds]
        return torch.stack([torch.stack(sentence)
                            for sentence in embeds]).transpose(0, 1).to(self.device)
Example no. 8
def get_Bert_embeddings(vocab, dim):
    from flair.embeddings import BertEmbeddings
    from flair.data import Sentence

    _embeddings = np.zeros([len(vocab), dim])
    sentence = Sentence(' '.join(vocab))

    embedding = BertEmbeddings()

    embedding.embed(sentence)
    for token in sentence:
        _embeddings[vocab[token.text]] = token.embedding.cpu().numpy()

    return _embeddings
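Here vocab is assumed to map each word to its row index in the output matrix, and dim must match the embedding size (3072 for flair's default BertEmbeddings). A hypothetical call:

# Illustrative vocab: word -> row index (values are assumptions).
vocab = {"grass": 0, "green": 1, "sky": 2}
emb_matrix = get_Bert_embeddings(vocab, dim=3072)
print(emb_matrix.shape)  # (3, 3072)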
Example no. 9
def get_Bert_embeddings(vocab, dim):
    from flair.embeddings import BertEmbeddings
    from flair.data import Sentence

    _embeddings = np.zeros([len(vocab), dim])
    sentence = Sentence(' '.join(vocab))

    embedding = BertEmbeddings()

    embedding.embed(sentence)
    for token in sentence:
        try:
            _embeddings[vocab[token.text]] = token.embedding.cpu().numpy()
        except KeyError:
            log.warning(f'Bad token {token.text} for Bert embedding')

    return _embeddings
Example no. 10
class BertEmbedder:
	"""Embed Bert Embeddings"""	
	def __init__(self, len, emb='en'):
		"""
		Args:
			len (int): max length for the model input
			lang (str, optional): embedding language. Defaults to 'en'.
		"""		
		if emb=='en':	self.embedder = BertEmbeddings("distilbert-base-uncased")
		self.MAX_LEN = len
		
	def embed_sentence(self, sentence):
		"""This function embed each sentence with BERT embedder

		Args:
			sentence (str): raw sentence

		Returns:
			np.array: embedded matrix
		"""		
		flair_sentence = Sentence(sentence)
		while len(flair_sentence) < self.MAX_LEN:
			flair_sentence.add_token(Token("__PAD__"))
		self.embedder.embed(flair_sentence)
		return np.stack([t.embedding.cpu().numpy() for t in flair_sentence])
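An illustrative instantiation of the embedder above; the max length is an arbitrary choice, and sentences shorter than it are padded with "__PAD__" tokens before embedding:

# Illustrative usage; a max length of 16 is arbitrary.
embedder = BertEmbedder(16, emb='en')
matrix = embedder.embed_sentence("Flair makes contextual embeddings easy")
print(matrix.shape)  # (max(16, n_tokens), embedding_dim)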
Example no. 11
    def contextualize(df, cluster_dump_dir):
        def get_cluster(tok_vec, cc):
            max_sim = -10
            max_sim_id = -1
            for i, cluster_center in enumerate(cc):
                sim = cosine_similarity(tok_vec, cluster_center)
                if sim > max_sim:
                    max_sim = sim
                    max_sim_id = i
            return max_sim_id

        print("Contextualizing the corpus..")
        embedding = BertEmbeddings('bert-base-uncased')
        stop_words = set(stopwords.words('english'))
        stop_words.add('would')
        except_counter = 0
        word_cluster = {}

        for index, row in df.iterrows():
            if index % 100 == 0:
                print("Finished rows: " + str(index) + " out of " +
                      str(len(df)))
            line = row["sentence"]
            sentences = sent_tokenize(line)
            for sentence_ind, sent in enumerate(sentences):
                sentence = Sentence(sent, use_tokenizer=True)
                embedding.embed(sentence)
                for token_ind, token in enumerate(sentence):
                    word = token.text
                    if word in stop_words:
                        continue
                    word_clean = word.translate(
                        str.maketrans('', '', string.punctuation))
                    if len(
                            word_clean
                    ) == 0 or word_clean in stop_words or "/" in word_clean:
                        continue
                    try:
                        cc = word_cluster[word_clean]
                    except KeyError:
                        try:
                            cc = word_cluster[word]
                        except KeyError:
                            word_clean_path = cluster_dump_dir + word_clean + "/cc.pkl"
                            word_path = cluster_dump_dir + word + "/cc.pkl"
                            try:
                                with open(word_clean_path, "rb") as handler:
                                    cc = pickle.load(handler)
                                word_cluster[word_clean] = cc
                            except Exception:
                                try:
                                    with open(word_path, "rb") as handler:
                                        cc = pickle.load(handler)
                                    word_cluster[word] = cc
                                except Exception as e:
                                    except_counter += 1
                                    print(
                                        "Exception Counter while getting clusters: ",
                                        except_counter, index, e)
                                    continue

                    if len(cc) > 1:
                        tok_vec = token.embedding.cpu().numpy()
                        cluster = get_cluster(tok_vec, cc)
                        sentence.tokens[token_ind].text = word + "$" + str(
                            cluster)
                sentences[sentence_ind] = to_tokenized_string(sentence)
            df["sentence"][index] = " . ".join(sentences)
        return df, word_cluster
Example no. 12
    def get_ent_emb_dict(self, df_ent_final_ranking, only_top_N_entities=10, dump_flair_res_to_pickle=False):
        print("In function: get_ent_emb_dict")
        os.environ['KMP_DUPLICATE_LIB_OK'] = 'True' #To get rid of this error: OMP: Error #15: Initializing libomp.dylib, but found libiomp5.dylib already initialized.
        bert_embedding = BertEmbeddings('bert-base-cased')#do_lower_case=False
        if dump_flair_res_to_pickle:
            PIK = self.base_dir + "flair_res_embeddings.pkl"
            print("saving pickle object at:  ", PIK)
            f = open(PIK, "wb")
            pickle.dump(len(self.df_extractions), f)

        entities = df_ent_final_ranking["entity"][:only_top_N_entities]
        ent_emb_lists = {}
        ent_has_enough_embs = {}
        for ind, ent in enumerate(entities):
            ent_emb_lists[ent] = {"count": 0, "type": df_ent_final_ranking.iloc[ind]["type"], "embeddings": []}
            ent_has_enough_embs[ent] = False

        cnt_found_entities = 0
        for ind_row, row in self.df_extractions.iterrows():
            if ind_row % 500 == 0:
                print(ind_row)
            #if ind_row > 1000:
            #    break
            has_entity = False
            for ent in entities:
                if ent_has_enough_embs[ent]:
                    continue
                if ent in row["sentence"].lower():
                    has_entity = True
                    break
            if not has_entity:
                continue
            sent_words = [t.text.lower() for t in row["flair_res"]]
            row_with_embeddings = row["flair_res"]
            bert_embedding.embed(row_with_embeddings)
            if dump_flair_res_to_pickle:
                pickle.dump(row_with_embeddings, f)
            #sent = self._get_sentence_space_delimited(row)
            #sent = sent.lower()
            #sent_words = sent.split(" ")
            '''
            Algo:
            for every word (w) in sentence, find the embedding for entities that start from w.
            '''
            for ind_w, w in enumerate(sent_words):
                for ent in entities:
                    if ent_has_enough_embs[ent]:
                        continue
                    ent_words = ent.split(" ")
                    ent_embs = []
                    ent_words_len = len(ent_words)
                    cnt = 0
                    while(cnt < ent_words_len):
                        if ind_w + cnt >= len(sent_words) or sent_words[ind_w+cnt] != ent_words[cnt]:
                            ent_embs = []
                            break
                        else:
                            #print(ind_w+1)
                            #print(row_with_embeddings.get_token(ind_w+1))
                            #print(row_with_embeddings.get_token(ind_w+1).embedding)
                            # flair get_token function is 1-based -> ind_w + 1 is needed
                            ent_embs.append(np.array(row_with_embeddings.get_token(ind_w + 1).embedding))
                        cnt += 1
                    if len(ent_embs) > 0:
                        ent_emb_lists[ent]["embeddings"].append(np.mean(ent_embs, axis=0))
                        ent_emb_lists[ent]["count"] += 1
                        # For speed, only average over a limited number of mentions;
                        # remove the following if-condition to average over all entity mentions.
                        if ent_emb_lists[ent]["count"] > 0:
                            ent_has_enough_embs[ent] = True
                            print(ent, " --- Embedding found.")
                            cnt_found_entities += 1
                            print("Number of found entities: ", cnt_found_entities)

        return ent_emb_lists
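A hedged sketch of consuming the returned dictionary (not part of the original source): collapse each found entity's mention embeddings into a single vector; the name ent_vectors is an assumption.

# Illustrative post-processing: one mean vector per entity that was found.
ent_vectors = {
    ent: np.mean(info["embeddings"], axis=0)
    for ent, info in ent_emb_lists.items()
    if info["count"] > 0
}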
Example no. 13
def test(dir_model, feature='LSTM'):
    if feature == 'BERT':
        model = BERT_CRF(tag_to_ix=tag_to_ix)
        checkpoint = torch.load(dir_model)
        model.load_state_dict(checkpoint)
        model = model.to(device)

        # load the pretrained BERT embeddings
        embedding = BertEmbeddings('bert-base-chinese', '-1', 'mean')
        while True:
            print('Enter text ("quit" to finish):\n')
            text = input()
            if text != 'quit':
                with torch.no_grad():
                    # convert the text to a tensor
                    x_test = Sentence(' '.join(text.replace(' ', '|')))
                    embedding.embed(x_test)
                    x_test = torch.cat(
                        [token.embedding.unsqueeze(0) for token in x_test],
                        dim=0).unsqueeze(0).to(device)
                    # output the predicted tags
                    test_tag = model(x_test)[0]
                    tag = [ix_to_tag[ix] for ix in test_tag]
                    # print(tag)
                    result = re.finditer("S|BM*E", ''.join(tag))
                    # locate the entities, i.e. the "words"
                    result = [[m.start(), m.end()] for m in result]
                    text_cut = ''
                    for i in result:
                        text_cut += ('/' + text[i[0]:i[1]])

                    print('\nSegmentation result:\n', text_cut, '\n')
            else:
                break
    else:
        # load the trained model
        model = BiLSTM_CRF(vocab_size=num_words + 2,
                           tag_to_ix=tag_to_ix,
                           embedding_dim=EMBEDDING_DIM,
                           hidden_dim=HIDDEN_DIM)
        checkpoint = torch.load(dir_model)
        model.load_state_dict(checkpoint)
        model = model.to(device)
        while True:
            print('Enter text ("quit" to finish):\n')
            text = input()
            if text != 'quit':
                with torch.no_grad():
                    # encode the text as character indices
                    x_test = [word_index.get(char, num_words) for char in text]
                    x_test = torch.LongTensor([x_test]).to(device)
                    # 输出标注结果
                    test_tag = model(x_test)[0]
                    tag = [ix_to_tag[ix] for ix in test_tag]
                    result = re.finditer("S|BM*E", ''.join(tag))
                    # locate the entities, i.e. the "words"
                    result = [[m.start(), m.end()] for m in result]
                    text_cut = ''
                    for i in result:
                        text_cut += ('/' + text[i[0]:i[1]])

                    print('\nSegmentation result:\n', text_cut, '\n')
            else:
                break
Example no. 14
def contextualizeSentences(strings, word_cluster):

    def cosine_similarity(a, b):
        return 1 - spatial.distance.cosine(a, b)

    def to_tokenized_string(sentence):
        tokenized = " ".join([t.text for t in sentence.tokens])
        return tokenized

    def get_cluster(tok_vec, cc):
        max_sim = -10
        max_sim_id = -1
        for i, cluster_center in enumerate(cc):
            sim = cosine_similarity(tok_vec, cluster_center)
            if sim > max_sim:
                max_sim = sim
                max_sim_id = i
        return max_sim_id

    out = []
    embedding = BertEmbeddings('bert-base-uncased')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    for index,string in enumerate(strings):
        print("Contextualizing the corpus ", index)
        stop_words = set(stopwords.words('english'))
        stop_words.add('would')

        # this tokenizer is used to check for length > 512
        sentences = sent_tokenize(string)
        for sentence_ind, sent in enumerate(sentences):
            tokenized_text = tokenizer.tokenize(sent)
            if len(tokenized_text) > 512:
                print('sentence too long for BERT: truncating to 512 tokens')
                sentence = Sentence(' '.join(tokenized_text[:512]), use_tokenizer=True)
            else:
                sentence = Sentence(sent, use_tokenizer=True)
            try:
                embedding.embed(sentence)
            except Exception as e:
                print(index, e)
                print(sentence)
            for token_ind, token in enumerate(sentence):
                word = token.text
                if word in stop_words:
                    continue
                word_clean = word.translate(str.maketrans('', '', string.punctuation))
                if len(word_clean) == 0 or word_clean in stop_words or "/" in word_clean:
                    continue
                try:
                    cc = word_cluster[word_clean]
                except Exception as e:
                    print("Exception Counter while getting clusters: ", index, e)
                    continue
                    # try:
                    #     cc = word_cluster[word]
                    # except:
                    #     word_clean_path = cluster_dump_dir + word_clean + "/cc.pkl"
                    #     word_path = cluster_dump_dir + word + "/cc.pkl"
                    #     try:
                    #         with open(word_clean_path, "rb") as handler:
                    #             cc = pickle.load(handler)
                    #         word_cluster[word_clean] = cc
                    #     except:
                    #         try:
                    #             with open(word_path, "rb") as handler:
                    #                 cc = pickle.load(handler)
                    #             word_cluster[word] = cc
                    #         except Exception as e:

                if len(cc) > 1:
                    tok_vec = token.embedding.cpu().numpy()
                    cluster = get_cluster(tok_vec, cc)
                    sentence.tokens[token_ind].text = word + "$" + str(cluster)
            sentences[sentence_ind] = to_tokenized_string(sentence)
        out.append(" . ".join(sentences))
    return out
Example no. 15
if args.layers == 'mean':
    embedding = BertEmbeddings(args.model_name, layers='-1,-2,-3,-4', use_scalar_mix=True, pooling_operation="mean")
else:
    embedding = BertEmbeddings(args.model_name, layers=args.layers, pooling_operation="mean")
    
if 'pubmed' in args.model_name.lower():
    embedding.tokenizer.basic_tokenizer.do_lower_case = False


flag = args.dataset
dataset = []
with open(f'./datasets/unified/train.{flag}.json') as f:
    dataset += json.load(f)
with open(f'./datasets/unified/valid.{flag}.json') as f:
    dataset += json.load(f)
with open(f'./datasets/unified/test.{flag}.json') as f:
    dataset += json.load(f)
    
    
bert_emb_dict = {}
for item in tqdm(dataset):
    tokens = tuple(item['tokens'])
    s = form_sentence(tokens)
    embedding.embed(s)
    emb = get_embs(s)
    bert_emb_dict[tokens] = emb.astype('float16')
    
    
with open(args.lm_emb_save_path, 'wb') as f:
    pickle.dump(bert_emb_dict, f)
Example no. 16
from flair.embeddings import BertEmbeddings
from flair.data import Sentence

# init embedding
embedding = BertEmbeddings(layers='-10')

# create a sentence
sentence = Sentence('The grass is green .')

# embed words in sentence
print(embedding.embed(sentence))

for token in sentence:
    print(token)
    print(token.embedding)
    print(token.embedding.shape)
Example no. 17
for token in sentence:
    print(token)
    print(token.embedding)

# load and run Flair embeddings
flair_embedding_forward = FlairEmbeddings('model/news-forward-0.4.1.pt')
sentence = Sentence('The grass is green .')
flair_embedding_forward.embed(sentence)
for token in sentence:
    print(token)
    print(token.embedding)

# load and run BERT embeddings
embedding = BertEmbeddings()
sentence = Sentence('The grass is green .')
embedding.embed(sentence)
for token in sentence:
    print(token)
    print(token.embedding)

# load and run ELMo embeddings
embedding = ELMoEmbeddings()
sentence = Sentence('The grass is green .')
embedding.embed(sentence)
for token in sentence:
    print(token)
    print(token.embedding)

# load and run stacked embeddings
stacked_embeddings = StackedEmbeddings([WordEmbeddings('model/glove.gensim'), FlairEmbeddings('model/news-forward-0.4.1.pt')])
sentence = Sentence('The grass is green .')
Example no. 18
len(fn.frames())

txt=preprocess.read_pg(data_root + r'\EN_1818_Shelley,Mary_Frankenstein_Novel.txt')
print(len(txt), 'chars')

from segtok.segmenter import split_single
sentences = [Sentence(s, use_tokenizer=True) for s in split_single(txt)]
print(len(sentences), 'sentences')

import random as rand

t = range(100)  # rand.sample(range(len(sentences)), 100)
sents_sample = [sentences[i] for i in sorted(t)]

t = np.array(t)
_ = bert_embedding.embed(sents_sample)

from scipy.spatial.distance import cosine
from torch.nn.functional import cosine_similarity
from itertools import product

def cosines(tokens):
    n = len(tokens)
    s = np.zeros([n, n])
    for (i, j), _ in np.ndenumerate(s):
        s[i, j] = cosine(tokens[i], tokens[j])
    return s

def cosines(vecs, return_type=np.zeros):
    vecs = list(vecs)
    n = len(vecs)
    c = return_type([n,n])
Example no. 19
def test(method='RNN'):
    if method not in ['RNN', 'BERT', 'BERT_RNN']:
        raise ValueError("method should be 'RNN','BERT' or 'BERT_RNN'")
    with open(dir_tokenizer, 'rb') as f:
        tokenizer = pickle.load(f)
    e_index = tokenizer.word_index['e']

    if method == 'RNN':
        net = NET_RNN().to(device)
        checkpoint = torch.load(MODEL_PATH_RNN)
        net.load_state_dict(checkpoint)
    else:
        if method == 'BERT':
            net = NET_BERT().to(device)
            checkpoint = torch.load(MODEL_PATH_BERT)
        else:
            net = NET_BERT_RNN().to(device)
            checkpoint = torch.load(MODEL_PATH_BERT_RNN)
        net.load_state_dict(checkpoint)
        embedding = BertEmbeddings(bert_model_or_path=EMBEDDING,
                                   pooling_operation=POOLING,
                                   layers=BERT_LAYERS)
    while True:
        print('\nEnter text to base the poem on. Leave empty for a random start; type quit to exit!\n')
        text = input('Input: ')
        if text == 'quit':
            break
        elif text == '':
            text = np.random.choice(list(tokenizer.index_word.values()))

        if method == 'RNN':
            while True:
                x_seq_batch = tokenizer.texts_to_sequences(texts=[text])
                x_seq_batch = torch.LongTensor(x_seq_batch).to(device)
                with torch.no_grad():
                    outputs = net(x_seq_batch)
                predicted = nn.Softmax(dim=0)(outputs.data.cpu()[-1])
                predicted = np.random.choice(np.arange(len(predicted)),
                                             p=predicted.numpy())
                if predicted not in [0, e_index]:
                    text += tokenizer.index_word[predicted]
                else:
                    break
                if len(text) >= opt.maxlen:
                    break
        else:
            while True:
                text_p = ' '.join(text)
                sentence = Sentence(text_p)
                embedding.embed(sentence)
                x_seq_batch = torch.Tensor(
                    [[token.embedding.numpy() for token in sentence]]).to(device)
                with torch.no_grad():
                    outputs = net(x_seq_batch)
                predicted = nn.Softmax(dim=0)(outputs.data.cpu()[-1])
                predicted = np.random.choice(np.arange(len(predicted)),
                                             p=predicted.numpy())
                if predicted not in [0, e_index]:
                    text += tokenizer.index_word[predicted]
                else:
                    break
                if len(text) >= opt.maxlen:
                    break
        text_list = re.findall(pattern='[^。?!]*[。?!]', string=text)
        print('Poem complete:\n')
        for i in text_list:
            print(i)
Example no. 20
from flair.data import Sentence
from flair.embeddings import BertEmbeddings

# instantiate BERT embeddings
bert_embeddings = BertEmbeddings()

# make example sentence
sentence = Sentence('I love Berlin.', use_tokenizer=True)

# embed sentence
bert_embeddings.embed(sentence)

# print embedded tokens
for token in sentence:
    print(token)
    print(token.embedding)
Example no. 21
bert_embedding = BertEmbeddings(args.bert_name,
                                layers='-1,-2,-3,-4',
                                use_scalar_mix=True,
                                pooling_operation="mean")

flag = args.dataset
dataset = []
with open(f'./datasets/unified/train.{flag}.json') as f:
    dataset += json.load(f)
with open(f'./datasets/unified/valid.{flag}.json') as f:
    dataset += json.load(f)
with open(f'./datasets/unified/test.{flag}.json') as f:
    dataset += json.load(f)

bert_emb_dict = {}
for item in tqdm(dataset):
    tokens = tuple(item['tokens'])
    s = form_sentence(tokens)

    s.clear_embeddings()
    bert_embedding.embed(s)
    emb = get_embs(s)  # (T, 4*H)

    s.clear_embeddings()
    albert_embedding.embed(s)
    emb = np.concatenate([emb, get_embs(s)], axis=-1)

    bert_emb_dict[tokens] = emb.astype('float16')

with open(args.lm_emb_save_path, 'wb') as f:
    pickle.dump(bert_emb_dict, f)
Example no. 22
        job_desc = re.sub(generic_re, '', job_desc)

    all_sentances = []
    doc = sent_nlp(job_desc)
    for sent in doc.sents:
        all_sentances.append(sent.string.strip())
    for sentance in all_sentances:
        if len(sentance) >= 5 and len(sentance) < 512:

            doc = Sentence(sentance,
                           use_tokenizer=build_spacy_tokenizer(sent_nlp))
            predictions = tagger.predict(doc)
            labels_dict = predictions[0].to_dict(tag_type='ner')

            all_entities = [item['text'] for item in labels_dict['entities']]
            embeddings.embed(doc)
            for token in doc:
                if token.text in all_entities:
                    tensor = token.embedding.detach().cpu().numpy()
                    skill_embeddings.append((token.text, tensor))

from sklearn.cluster import KMeans
import numpy as np

embeddings_df = pd.DataFrame(skill_embeddings, columns=['skill', 'embedding'])
embeddings_df['skill'] = embeddings_df['skill'].map(lambda x: x.lower())
skill_counts = embeddings_df.groupby('skill').size()
avg_embed = embeddings_df.groupby('skill')['embedding'].apply(np.mean)
full_df = pd.concat([skill_counts, avg_embed], axis=1)
full_df.columns = ['count', 'embedding']
full_df = full_df.loc[full_df['count'] >= 5]
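The excerpt stops before the imported KMeans is used; a hedged sketch of one plausible continuation, where the cluster count and the 'cluster' column name are assumptions rather than the original author's code:

# Illustrative only: cluster the averaged skill embeddings.
X = np.vstack(full_df['embedding'].values)
kmeans = KMeans(n_clusters=20, random_state=0).fit(X)
full_df['cluster'] = kmeans.labels_
print(full_df.groupby('cluster').size())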