def __init__(self, model='bert_24_1024_16', corpus='book_corpus_wiki_en_cased'):
    self.__model = model
    self.__corpus = corpus
    assert self.__model in ['bert_12_768_12', 'bert_24_1024_16'], "Model is not recognized."
    assert self.__corpus in [
        'book_corpus_wiki_en_uncased',
        'book_corpus_wiki_en_cased',
        'wiki_multilingual',
        'wiki_multilingual_cased'
    ], "Corpus is unknown."
    self.__bert = BertEmbedding(model=self.__model, dataset_name=self.__corpus)
def __init__(self, model, ignore_stopwords=True):
    self.model = model
    if self.model == "elmo":
        self.MODEL = ElmoEmbedder()
    elif self.model == "bert":
        self.MODEL = BertEmbedding()
    elif self.model == 'roberta-large':
        self.num_layers = 17
        self.tokenizer = AutoTokenizer.from_pretrained(self.model)
        self.MODEL = AutoModel.from_pretrained(self.model)
        # Keep only the first num_layers encoder layers of RoBERTa.
        self.MODEL.encoder.layer = torch.nn.ModuleList(
            [layer for layer in self.MODEL.encoder.layer[:self.num_layers]])
        self.MODEL.eval()
    self.nlp = spacy.load("en_core_web_md")
    self.ignore_stopwords = ignore_stopwords
def createCustomBertEmbeddings(text: List[str], firstWordOnly=True, saveName: Union[str, Path] = None):
    from bert_embedding import BertEmbedding

    bert_embedding = BertEmbedding()
    result = bert_embedding(text)
    indices = [r[0] for r in result]
    data = [r[1] for r in result]
    if firstWordOnly:
        # Keep only the first alphanumeric token (and its vector) of each sentence.
        outIdxs = []
        outData = []
        for ii, idx in enumerate(indices):
            for jj, word in enumerate(idx):
                if word.isalnum():
                    outIdxs.append(word)
                    outData.append(data[ii][jj])
                    break
        indices = outIdxs
        data = outData
    else:
        indices = np.concatenate(indices)
        data = np.row_stack(data)
    out = pd.DataFrame(index=indices, data=data)
    if saveName:
        out.to_csv(saveName)
    return out
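# Illustrative usage sketch for createCustomBertEmbeddings (not from the source);
# the sentences and the CSV path are assumptions made for the example.
sentences = ["Transformers encode context.", "Embeddings are dense vectors."]
first_word_df = createCustomBertEmbeddings(sentences, firstWordOnly=True,
                                           saveName="first_word_embeddings.csv")
print(first_word_df.shape)  # one 768-dimensional row per sentence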
class Bert(Embedder):
    def __init__(self):
        super().__init__('bert')
        self.output_format = {'n_cols': 768}
        self.model = BertEmbedding(model='bert_12_768_12',
                                   dataset_name='book_corpus_wiki_en_uncased')

    # Core functions
    def embed_text(self, abstracts):
        """
        :param abstracts: pandas Series of abstracts
        :return: embedding DataFrame and the associated output format dict
        """
        # One (tokens, vectors) pair per abstract; sum the token vectors of each abstract.
        bert_embedding = self.model.embedding(abstracts.tolist(), oov_way='sum')
        embedding = []
        for _, vectors in bert_embedding:
            embedding.append(sum(vectors))
        embedding = pd.DataFrame(embedding)
        return embedding, self.output_format
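# Illustrative usage sketch for the Bert embedder above (not from the source).
# It assumes the project's Embedder base class is importable and uses made-up abstracts.
abstracts = pd.Series(["Deep learning for protein folding.",
                       "Graph neural networks for molecule property prediction."])
embedding, fmt = Bert().embed_text(abstracts)
print(embedding.shape, fmt)  # expected: (2, 768) and {'n_cols': 768}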
class bert_instance():
    bert_embedding = BertEmbedding()

    def matrix(self, processed_texts, enquiry):
        # Embed the enquiry and every processed text, then compare with cosine similarity.
        all_vectors = []
        enquiry_vector = np.array([self.return_vectors(enquiry)])
        for text in processed_texts:
            all_vectors.append(self.return_vectors(text))
        all_vectors = np.array(all_vectors)
        print(all_vectors)
        print(all_vectors.shape)
        matrix = cosine_similarity(all_vectors, enquiry_vector)
        return matrix

    def return_vectors(self, text):
        # Average the token vectors of a single text into one sentence vector.
        vectorfile = self.bert_embedding([text])
        print(len(vectorfile))
        vectorlist = vectorfile[0][1]
        sum_vector = np.zeros(shape=vectorlist[0].shape)
        for vector in vectorlist:
            sum_vector += vector
        sum_vector /= len(vectorlist)
        sum_vector = np.nan_to_num(sum_vector)
        return sum_vector
def getEmbeddings(df, n_restaurants=None, average=True, cuisines=True, full=False):
    print('Cleaning Zomato data')
    cuisines, urls, names = zomatoPreprocess(df, cuisines=cuisines, full=full)
    if n_restaurants is None:
        n_restaurants = len(cuisines)
    print('Retrieving BERT sentence representations for {} restaurants...\n'.format(n_restaurants))
    __bert_embedding = BertEmbedding(model='bert_12_768_12')
    __berts = __bert_embedding(cuisines[:n_restaurants])
    bagofembeddings = __bagofBERTs(names, average, urls, __berts)
    print('Complete.')
    # Keep only entries that received a non-empty embedding.
    filtrd = [(n, u, c, e) for n, u, c, e in bagofembeddings if len(e.shape) > 0]
    cuisines = [c for n, u, c, e in filtrd]
    embeds = [e for n, u, c, e in filtrd]
    names = [n for n, u, c, e in filtrd]
    urls = [u for n, u, c, e in filtrd]
    return bagofembeddings
def __init__(self):
    self.bert_embedding = BertEmbedding()
    # self.vectors_bank_dic = load_obj("word2vec")
    basepath = os.path.abspath(".")
    # Load the precomputed word-vector bank from a compressed pickle.
    data = bz2.BZ2File(os.path.join(basepath, "data", "bert", "word2vectorUpdate.pbz2"), 'rb')
    self.vectors_bank_dic = cPickle.load(data)
def run():
    # In[5]:
    baseline = Baseline(DIR, DATASET, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D)

    # In[6]:
    baseline.load_ids(DIR)
    print(len(baseline.bug_ids))

    # In[8]:
    # #### Read the corpus from bugs
    load_bugs(baseline)

    # In[9]:
    sent_title = [
        baseline.bug_set[bug_id]['title'][:MAX_SEQUENCE_LENGTH_T]
        for bug_id in baseline.bug_ids
    ]
    sent_desc = [
        baseline.bug_set[bug_id]['description'][:MAX_SEQUENCE_LENGTH_D]
        for bug_id in baseline.bug_ids
    ]

    # In[10]:
    print(len(sent_title), len(sent_desc))

    # ### BERT embedding
    # In[11]:
    ctx = mx.gpu(0)
    bert_embedding = BertEmbedding(ctx, batch_size=32, max_seq_length=MAX_SEQUENCE_LENGTH_D)

    # ### Save dataset vocabulary embedding
    # In[23]:
    # res = paralelize_processing([baseline.bug_ids, sent_title, sent_desc],
    #                             vectorizing_bugs, (baseline.DIR, bert_embedding, baseline.bug_set, ))
    vectorizing_bugs([baseline.bug_ids, sent_title, sent_desc],
                     baseline.DIR, bert_embedding, baseline.bug_set)
    print("Vectorized all dataset with BERT.")

    # In[ ]:
    bug_selected = np.random.choice(baseline.bug_ids, 1)[0]
    bug = baseline.bug_set[bug_selected]
    print("Testing if a random bug has bert embeddings")
    assert len(bug['title_bert_embed']) == 768
    assert len(bug['desc_bert_embed']) == 768
    print("Embedding with BERT trained!")
def get_bert_embedding_of_several_words_as_pd_df(logger=None,
                                                 phrase_in=None,
                                                 root_colnames='dim_',
                                                 dim_vector_rep=768):
    try:
        lst_phrase = [phrase_in]
        colnames = ['{0}{1}'.format(root_colnames, i) for i in range(1, dim_vector_rep + 1)]
        if logger is not None:
            logger.info(" - computing BERT representation for input token: '{0}'".format(phrase_in))
        bert_embedding = BertEmbedding()
        bert_rep = bert_embedding(lst_phrase)
        lst_words = bert_rep[0][0]
        lst_bert_rep = bert_rep[0][1]
        w_context_vect = pd.DataFrame(data={'id_context': [i for i in range(1, len(lst_words) + 1)],
                                            'w_context': lst_words,
                                            'w_context_bert': lst_bert_rep})
        # Expand each token vector into dim_vector_rep named columns.
        df_vect = pd.concat([
            pd.DataFrame(data=w_context_vect['w_context_bert'][i].reshape(1, len(w_context_vect['w_context_bert'][i])),
                         columns=colnames)
            for i in range(len(w_context_vect.index))
        ])
        df_vect.index = w_context_vect.index
        w_context_vect = pd.concat([w_context_vect[['id_context', 'w_context']], df_vect], axis=1)
    except Exception:
        if logger is not None:
            logger.exception("ERROR getting BERT-embedding of token '{0}'".format(phrase_in))
        raise
    return w_context_vect
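# Illustrative usage sketch for the helper above (not from the source);
# the phrase is an assumed example.
tokens_df = get_bert_embedding_of_several_words_as_pd_df(phrase_in='contextual word embeddings')
print(tokens_df[['id_context', 'w_context']])
print(tokens_df.shape)  # one row per token, with dim_1 ... dim_768 columns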
def load_bert(self):
    from bert_embedding import BertEmbedding
    import mxnet as mx
    bert = BertEmbedding(model='bert_12_768_12', dataset_name='wiki_cn_cased', ctx=mx.gpu(0))
    return bert
class Embedder:
    def __init__(self, max_seq_length, batch_size=32):
        self.embedder = BertEmbedding(max_seq_length)
        self.batch_size = batch_size

    def fit(self, X, y):
        return self

    def transform(self, X):
        # result = []
        # for i, row in enumerate(X):
        #     embedding = self.embedder.project(row)
        #     result.append(embedding)
        # batching
        result = []
        i = 0
        while i < len(X):
            print("start processing {} / {}".format(i, len(X)))
            batch = X[i:(i + self.batch_size)]
            embedding = self.embedder.project_batch(batch)
            result += embedding
            i += self.batch_size
        return np.array(result)
def predictor(model_path, arg1, arg2=None, originalArg1=None, originalArg2=None,
              ADU=False, verbose=1, fullText=None):
    """Generates model-readable data from the propositions and predicts on it."""
    if verbose > 0:
        print('Start loading resources...')
    model = load_model(model_path)
    bert_embedding = BertEmbedding(model='bert_12_768_12',
                                   dataset_name='book_corpus_wiki_en_cased',
                                   max_seq_length=35)
    if verbose > 0:
        print('Generate prediction...')
    fulls = [fullText.replace('\n', ' ')] * len(arg1)
    data = data_builder.generate_data_with_features(arg1, arg2, originalArg1, originalArg2,
                                                    fulls, bert_embedding)
    if not ADU:
        features = model_builder.select_FFNN_features(data, shared_feature_list=None,
                                                      original_bert=True)
    else:
        features = model_builder.select_FFNN_features(data, shared_feature_list=None,
                                                      original_bert=True, has_2=False)
    prediction = model.predict(features)
    if verbose > 0:
        print(prediction)
    return prediction
def _get_bert_embed(tokens):
    # Embed each input entry and keep only its first token vector.
    embedding = BertEmbedding().embedding(sentences=tokens)
    word_array = []
    for i in range(len(embedding)):
        word_array.append(embedding[i][1][0])
    return word_array
def fit(self, X, y=None):
    if self.bert_args['bert_model'] is not None and self.bert_args['bert_dataset_name'] is not None:
        if self.bert_args['ctx'] is not None:
            self.ft_model = BertEmbedding(model=self.bert_args['bert_model'],
                                          dataset_name=self.bert_args['bert_dataset_name'],
                                          ctx=self.bert_args['ctx'])
        else:
            self.ft_model = BertEmbedding(model=self.bert_args['bert_model'],
                                          dataset_name=self.bert_args['bert_dataset_name'])
    else:
        self.ft_model = BertEmbedding()
    return self
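# Illustrative sketch of a bert_args dict that the fit() above expects (not from
# the source); the surrounding estimator class and the GPU context are assumptions.
import mxnet as mx

bert_args = {
    'bert_model': 'bert_12_768_12',
    'bert_dataset_name': 'book_corpus_wiki_en_uncased',
    'ctx': mx.gpu(0),  # or None to fall back to the default context
}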
def gen_features(X, wday, yday, i, j, n, tf=None, u=None, n_features=1400):
    from sklearn.feature_extraction import FeatureHasher
    from sklearn.decomposition import TruncatedSVD
    from scipy.spatial.distance import cosine
    from collections import Counter
    import numpy
    from bert_embedding import BertEmbedding
    import mxnet as mx

    ctx = mx.gpu(0)
    bert_embedding = BertEmbedding(ctx=ctx)
    result = bert_embedding(X)

    # Token frequencies over the corpus (reused on later calls via the tf argument).
    if tf is None:
        tf = Counter()
        for r in result:
            tf.update(r[0])
    N = sum(tf.values())
    h = FeatureHasher(n_features=n_features, input_type="string")

    def s_from_w(s):
        # Inverse-frequency-weighted average of token embeddings concatenated
        # with hashed word features.
        words = s[0]
        embedding = numpy.array(s[1])
        embedding = numpy.concatenate((embedding, h.transform(words).toarray()), axis=1)
        weight = numpy.array([1 / (1 + tf[x] / N) / len(words) for x in words])
        return weight.dot(embedding)

    SX = numpy.array([s_from_w(x) for x in result])

    # Remove the projection onto the first principal component of the sentence matrix.
    if u is None:
        svd = TruncatedSVD(n_components=1, n_iter=8, random_state=42)
        svd.fit(SX)
        u = svd.components_
    v2 = SX - SX.dot(u.transpose()) * u

    wday = numpy.array(wday, ndmin=2)
    yday = numpy.array(yday, ndmin=2)
    max_sim = wday * 0
    for K, _ in enumerate(max_sim):
        if i[K] > 0:
            max_sim[K] = max((1 - cosine(SX[K, :], SX[K2, :])) ** 2
                             for K2 in range(K) if j[K] == j[K2])
    i = numpy.array(i, ndmin=2)
    i_scaled = i / numpy.array(n, ndmin=2)
    return numpy.hstack((wday.T, yday.T, i.T, i_scaled.T, max_sim.T, v2)), tf, u
def create_embedding(name):
    if name == "BERT":
        # COMMENT: will we ever change max_seq_length?
        return BertEmbedding(max_seq_length=100,
                             model='bert_12_768_12',
                             dataset_name='book_corpus_wiki_en_cased')
    # elif name == "FastText":
    #     return FastText()
    else:
        print(f"ERROR: Unknown embedding type '{name}'! Supported embeddings: 'BERT' and 'FastText'.")
        exit(-1)
def get_bert_embedding(self):
    '''Creates word embeddings from the BERT language representation model.

    returns: list of (tokens, token vectors) pairs, one per tokenized sentence.'''
    bert_embedding = BertEmbedding().embedding(sentences=self.tokenized_sent)
    return bert_embedding
def wordlist_to_bert(wordlist_file):
    x_list = np.load(wordlist_file, allow_pickle=True)
    bert = BertEmbedding(max_seq_length=PADDING_LEN)
    bert_list = []
    for sentence in x_list:
        words, vectors = zip(*bert(sentence))
        # Flatten the per-sentence lists of token vectors into one sequence.
        flat_vectors = [item for sublist in vectors for item in sublist]
        bert_list.append(np.squeeze(np.asarray(flat_vectors)))
        # print(bert_list[-1].shape)
    return bert_list
def create_sent_embd(df):
    from bert_embedding import BertEmbedding
    from tqdm import tqdm_notebook as tqdm

    bert_embedding = BertEmbedding()
    statements = list(df['statement_filt'])
    result = bert_embedding(statements)
    # Collect the token vectors of each statement.
    vect = []
    for i in tqdm(range(len(result))):
        vect.append(result[i][1])
    return vect
def saveBERT(embedding_file, data_file='../data/FrameTerms_refined.ibo'):
    bert = BertEmbedding(max_seq_length=100)

    # Read the data file: blank-line-separated sentences, one token per line;
    # the token text is the second space-separated field.
    with open(data_file) as f:
        data = f.read().strip('\n\n')
    data = data.split('\n\n')

    sentences = []
    ids = {}
    for sentence in data:
        words = []
        terms = sentence.split('\n')
        if len(terms) < 2:
            continue
        for wordVector in terms:
            word = wordVector.split(' ')[1]
            words.append(word)
        text = str.join(' ', words)
        if len(text) > 0:
            sentences.append(text)
            ids[len(ids)] = text
    print("Data loaded...")
    print("Data processed...")

    # Keep any embeddings already written to the file, then append the new ones.
    with open(embedding_file) as f:
        prev = f.readlines()

    bert_embeddings = {}
    for i in range(0, len(ids), 50):
        # Encode the sentences in chunks of 50.
        bert_encoding = bert(sentences[i:i + 50])
        print(str(i) + '/' + str(len(ids)))
        for j in range(50):
            if j < len(bert_encoding):
                bert_embed = bert_encoding[j][1]
                x_tensor = torch.tensor(bert_embed, dtype=torch.float)
                bert_embeddings[i + j] = x_tensor.tolist()

    with open(embedding_file, 'w') as f:
        for line in prev:
            f.write(line)
        for i in range(len(ids)):
            f.write(str(bert_embeddings[i]) + '\n')

    # Rewrite the indexer with the new sentences appended.
    with open('../data/embeddings/bert/indexer.json') as f:
        indexer = json.load(f)
    with open('../data/embeddings/bert/indexer2.json', 'w') as f:
        for i in range(len(indexer)):
            f.write(indexer[i] + '\n')
        for s in sentences:
            f.write(s + '\n')
def pa_to_sen(self):
    """Split the paragraph into sentences and embed each one with BERT.

    Returns the per-sentence BERT output (a list of (tokens, token vectors) pairs,
    with one 768-dimensional vector per word), the number of sentences,
    and the raw sentences.
    """
    bert_embedding = BertEmbedding()
    self.sentences = self.description.split('. ')
    self.sent_bert = bert_embedding(self.sentences)
    self.num_sen = len(self.sent_bert)
    return self.sent_bert, self.num_sen, self.sentences
def build(self):
    # build word embedding
    # in_id = Input(shape=(self._bretMaxLen,), name="input_ids")
    # in_mask = Input(shape=(self._bretMaxLen,), name="input_masks")
    # in_segment = Input(shape=(self._bretMaxLen,), name="segment_ids")
    # inputs = [in_id, in_mask, in_segment]
    # word_embeddings = BertLayer(n_fine_tune_layers=3, bert_path=self._bert_path)(inputs)
    word_ids = Input(batch_shape=(None, None), dtype='int32', name='word_input')
    inputs = [word_ids]
    word_embeddings = BertEmbedding(model='bert_24_1024_16',
                                    dataset_name='book_corpus_wiki_en_cased')(word_ids)

    # build character based word embedding
    # if self._use_char:
    #     print("char Embedding layer On")
    #     char_ids = Input(batch_shape=(None, None, None), dtype='int32', name='char_input')
    #     inputs.append(char_ids)
    #     char_embeddings = Embedding(input_dim=self._char_vocab_size,
    #                                 output_dim=self._char_embedding_dim,
    #                                 mask_zero=True,
    #                                 name='char_embedding')(char_ids)
    #     char_embeddings = TimeDistributed(Bidirectional(LSTM(self._char_lstm_size)))(char_embeddings)
    #     word_embeddings = Concatenate()([word_embeddings, char_embeddings])
    #
    # word_embeddings = Dropout(self._dropout)(word_embeddings)

    z = Bidirectional(LSTM(units=self._word_lstm_size,
                           return_sequences=True,
                           dropout=self._layerdropout,
                           recurrent_dropout=self._layerdropout))(word_embeddings)
    if self._layer2Flag:
        z = Bidirectional(LSTM(units=self._word_lstm_size,
                               return_sequences=True,
                               dropout=self._layerdropout,
                               recurrent_dropout=self._layerdropout))(z)
    z = Dense(self._fc_dim, activation='tanh')(z)

    if self._use_crf:
        crf = CRF(self._num_labels, sparse_target=False)
        loss = crf.loss_function
        pred = crf(z)
    else:
        loss = 'categorical_crossentropy'
        pred = Dense(self._num_labels, activation='softmax')(z)

    model = Model(inputs=inputs, outputs=pred)
    return model, loss
def test_bert_embedding():
    bert_abstract = """We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers.
Unlike recent language representation models, BERT is designed to pre-train deep bidirectional representations by jointly conditioning on both left and right context in all layers.
As a result, the pre-trained BERT representations can be fine-tuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial task-specific architecture modifications.
BERT is conceptually simple and empirically powerful.
It obtains new state-of-the-art results on eleven natural language processing tasks, including pushing the GLUE benchmark to 80.4% (7.6% absolute improvement), MultiNLI accuracy to 86.7 (5.6% absolute improvement) and the SQuAD v1.1 question answering Test F1 to 93.2 (1.5% absolute improvement), outperforming human performance by 2.0%."""
    sentences = bert_abstract.split('\n')
    bert_embedding = BertEmbedding()
    embedding = bert_embedding(sentences)
    assert len(embedding) == 5
    assert len(embedding[0]) == 2
    assert len(embedding[0][0]) == 18
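# Minimal follow-up sketch (not part of the test above) showing the structure
# BertEmbedding returns: one (tokens, token_vectors) pair per input sentence.
# The sentence text here is an assumed example.
bert_embedding = BertEmbedding()
tokens, vectors = bert_embedding(["BERT produces one vector per token."])[0]
print(len(tokens), len(vectors))  # same number of tokens and vectors
print(vectors[0].shape)           # (768,) for the bert_12_768_12 model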
def bert(self, prep_obj):
    bert_embedding = BertEmbedding(model='bert_12_768_12',
                                   dataset_name='book_corpus_wiki_en_cased')
    result = np.array(bert_embedding(prep_obj.detokenized_corpus))
    for sentence in result:
        # Average the word vectors of each sentence into one 768-dimensional vector.
        vec = np.zeros(768)
        for word_vec in sentence[1]:
            vec = np.add(vec, np.array(word_vec))
        vec = np.true_divide(vec, 1 if len(sentence[1]) == 0 else len(sentence[1]))
        self.vector_corpus.append(vec)
def generateData(corpus_file, classes, split_percentage, load_embedding_from_file=False,
                 save_embedding_dict=False, verbose=True,
                 embedding_dict_filename="embedding_dicts/embedding_dict.pkl",
                 shuffle=True, ignore_words=None):
    if ignore_words is None:
        ignore_words = []
    sentences = []

    # Read in corpus, keeping only lines that mention one of the classes.
    if verbose:
        print("Loading corpus...")
    with open(corpus_file, 'r') as f:
        for line in f:
            if any(w.lower() in line.lower() for w in classes):
                sentences.append(line.strip().replace('-', ' '))
    if shuffle:
        random.shuffle(sentences)
    # sentences = sentences[0:50]

    if verbose:
        print("Computing Bert Embeddings...")
    # Split into train and test sets.
    num_train = int(len(sentences) * split_percentage)
    train_set = sentences[0:num_train]
    test_set = sentences[num_train:]

    # Build (or load) a word -> BERT vector dictionary over the vocabulary.
    if verbose:
        print(len(train_set), len(test_set))
    words = [w.lower() for line in train_set + test_set for w in line.split()]
    words = list(set(words))
    bert = BertEmbedding()
    if load_embedding_from_file:
        with open(embedding_dict_filename, 'rb') as f:
            embedding_dict = pickle.load(f)
    else:
        embedding_dict = dict([(x[0], y[0]) for x, y in bert(words)])
        if save_embedding_dict:
            with open(embedding_dict_filename, 'wb') as f:
                pickle.dump(embedding_dict, f, pickle.HIGHEST_PROTOCOL)

    if verbose:
        print("Preparing dataset...")

    def create_dataset(dataset, max_len):
        data = []
        labels = []
        for sentence in dataset:
            for i in range(len(classes)):
                word = classes[i]
                if word.lower() in sentence.lower():
                    s = sentence.lower().split()
                    x = [np.array(embedding_dict[w]) for w in s
                         if w not in word.lower() and w not in ignore_words and w in embedding_dict]
                    x = np.array(x)
                    max_len = max_len if x.shape[0] < max_len else x.shape[0]
                    x = torch.tensor(x, dtype=torch.float)
                    x = x.unsqueeze(0)
                    data.append(x)
                    y = torch.tensor([i], dtype=torch.long)
                    labels.append(y)
        return data, labels, max_len

    train_set, train_labels, max_len = create_dataset(train_set, 0)
    test_set, test_labels, max_len = create_dataset(test_set, max_len)
    return train_set, train_labels, test_set, test_labels, classes
def get_bert_embs(texts, label):
    ctx = mx.gpu(0)
    bert_embedding = BertEmbedding(model='bert_12_768_12',
                                   dataset_name='wiki_multilingual_cased',
                                   max_seq_length=2048,
                                   ctx=ctx)
    embs_list = []
    # Embed at most 10,000 texts, skipping missing (float NaN) entries.
    length = len(texts) if len(texts) <= 10000 else 10000
    for i in tqdm(range(0, length)):
        if type(texts[i]) is not float:
            result = bert_embedding([texts[i]])
            # Drop the first and last token vectors.
            embs_list.append(result[0][1][1:-1])
    with open(f'../data/reviews-eng/embs_full/embs_{label}.bin', 'wb') as file:
        pickle.dump(embs_list, file)
def tree_to_bert(tree_labels, verbose=False):
    bert = BertEmbedding(max_seq_length=PADDING_LEN)
    x_list = []
    y_list = []
    for t in tree_labels:
        y, x = t.to_labeled_lines()[0]
        y_list.append(y)
        str_list, arr_list = zip(*bert([x]))
        if verbose:
            print(str_list)
        x_list.append(np.squeeze(np.asarray(arr_list)))
        # print(x_list[-1].shape)
    return x_list, y_list
class Embedder:
    def __init__(self, max_seq_length, batch_size):
        self.batch_size = batch_size
        if batch_size == 1:
            self.embedder = BertEmbedding(512)
            print("seq length set to Bert maximum 512 when batch size is 1")
        else:
            self.embedder = BertEmbedding(max_seq_length)

    def fit(self, X, y):
        return self

    def transform(self, X):
        if self.batch_size == 1:
            return self.__transform_in_single(X)
        else:
            return self.__transform_in_batch(X)

    def __transform_in_single(self, X):
        result = []
        for i, row in enumerate(X):
            embedding = self.embedder.project(row)
            result.append(embedding)
        return np.array(result)

    def __transform_in_batch(self, X):
        result = []
        i = 0
        while i < len(X):
            print("start processing {} / {}".format(i, len(X)))
            batch = X[i:(i + self.batch_size)]
            embedding = self.embedder.project_batch(batch)
            result += embedding
            i += self.batch_size
        return np.array(result)
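# Illustrative usage sketch for the Embedder wrapper above (not from the source).
# It assumes the custom BertEmbedding class used here (taking max_seq_length as its
# constructor argument and exposing project/project_batch), not the bert_embedding package.
texts = ["first example sentence", "second example sentence"]
embedder = Embedder(max_seq_length=64, batch_size=2)
features = embedder.fit(texts, None).transform(texts)
print(features.shape)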