def createTokenizer():
    currentDir = "/home/puneet/code/Interspeech/"  # os.path.dirname(os.path.realpath('/content/drive/MyDrive/iemocap(version2)/'))
    modelsFolder = os.path.join(currentDir, "iemocap(version2)/model", "multi_cased_L-12_H-768_A-12")
    vocab_file = os.path.join(modelsFolder, "vocab.txt")
    # Note: the checkpoint is multi_cased, so do_lower_case=True may not match its vocab.
    tokenizer = bert_tokenization.FullTokenizer(vocab_file, do_lower_case=True)
    return tokenizer
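# A minimal usage sketch for createTokenizer() above; the sample sentence is
# illustrative only, not taken from the original project.
tokenizer = createTokenizer()
tokens = tokenizer.tokenize("this session made me angry")
token_ids = tokenizer.convert_tokens_to_ids(["[CLS]"] + tokens + ["[SEP]"])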
def getDeepBias(text):
    global mod
    print(text)
    text = text.replace('\n', ' ')
    tokenizer = bert_tokenization.FullTokenizer(vocab_file='vocab.txt', do_lower_case=False)
    tokens = tokenizer.tokenize(text)

    # Split the token stream into windows of 382 tokens so that, with [CLS] and
    # [SEP] added, each segment fits the model's 384-token input.
    token_segments = []
    index = 382
    tmp = ['[CLS]'] + tokens[:382] + ['[SEP]']
    tmp = tokenizer.convert_tokens_to_ids(tmp)
    while len(tmp) < 384:
        tmp.append(0)
    token_segments.append(np.array(tmp))
    while index < len(tokens):
        index += 382
        temp = ['[CLS]'] + tokens[index - 382:index] + ['[SEP]']
        temp = tokenizer.convert_tokens_to_ids(temp)
        if len(temp) > 100:  # skip very short trailing segments
            while len(temp) < 384:
                temp.append(0)
            token_segments.append(np.array(temp))
    token_segments = np.array(token_segments)
    print(token_segments.shape)

    # Predict on each segment and average the class scores across segments.
    preds = []
    for t in token_segments:
        print(t)
        preds.append(mod.predict(t.reshape(1, 384)))
    avg = np.zeros(2)
    for p in preds:
        avg += p[0]  # each prediction has shape (1, 2)
    avg /= len(preds)
    print("avg:" + str(avg))
    return avg[0]
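# Hypothetical call to getDeepBias() above, assuming `mod` has already been
# loaded as a compiled Keras model and vocab.txt sits in the working directory.
article_text = open("article.txt").read()
bias_score = getDeepBias(article_text)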
def test_compare(self):
    model_dir = tempfile.TemporaryDirectory().name
    os.makedirs(model_dir)
    save_path = MiniBertFactory.create_mini_bert_weights(model_dir)
    tokenizer = bert_tokenization.FullTokenizer(
        vocab_file=os.path.join(model_dir, "vocab.txt"), do_lower_case=True)

    # prepare input
    max_seq_len = 16
    input_str = "hello, bert!"
    input_tokens = tokenizer.tokenize(input_str)
    input_tokens = ["[CLS]"] + input_tokens + ["[SEP]"]
    input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
    input_ids = input_ids + [0] * (max_seq_len - len(input_tokens))
    # mask real tokens with 1 and padding with 0
    input_mask = [1] * len(input_tokens) + [0] * (max_seq_len - len(input_tokens))
    token_type_ids = [0] * len(input_tokens) + [0] * (max_seq_len - len(input_tokens))

    input_ids = np.array([input_ids], dtype=np.int32)
    input_mask = np.array([input_mask], dtype=np.int32)
    token_type_ids = np.array([token_type_ids], dtype=np.int32)

    print("   tokens:", input_tokens)
    print("input_ids:{}/{}:{}".format(len(input_tokens), max_seq_len, input_ids),
          input_ids.shape, token_type_ids)

    bert_1_seq_out = CompareBertActivationsTest.predict_on_stock_model(
        model_dir, input_ids, input_mask, token_type_ids)
    bert_2_seq_out = CompareBertActivationsTest.predict_on_keras_model(
        model_dir, input_ids, input_mask, token_type_ids)

    np.set_printoptions(precision=9, threshold=20, linewidth=200,
                        sign="+", floatmode="fixed")
    print("stock bert res", bert_1_seq_out.shape)
    print("keras bert res", bert_2_seq_out.shape)
    print("stock bert res:\n {}".format(bert_1_seq_out[0, :2, :10]), bert_1_seq_out.dtype)
    print("keras bert_res:\n {}".format(bert_2_seq_out[0, :2, :10]), bert_2_seq_out.dtype)

    abs_diff = np.abs(bert_1_seq_out - bert_2_seq_out).flatten()
    print("abs diff:", np.max(abs_diff), np.argmax(abs_diff))
    self.assertTrue(np.allclose(bert_1_seq_out, bert_2_seq_out, atol=1e-6))
def test_finetune(self):
    model_dir = tempfile.TemporaryDirectory().name
    os.makedirs(model_dir)
    save_path = MiniBertFactory.create_mini_bert_weights(model_dir)
    tokenizer = bert_tokenization.FullTokenizer(
        vocab_file=os.path.join(model_dir, "vocab.txt"), do_lower_case=True)

    # prepare input
    max_seq_len = 24
    input_str_batch = ["hello, bert!", "how are you doing!"]

    input_ids_batch = []
    token_type_ids_batch = []
    for input_str in input_str_batch:
        input_tokens = tokenizer.tokenize(input_str)
        input_tokens = ["[CLS]"] + input_tokens + ["[SEP]"]
        print("input_tokens len:", len(input_tokens))
        input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
        input_ids = input_ids + [0] * (max_seq_len - len(input_tokens))
        token_type_ids = [0] * len(input_tokens) + [0] * (max_seq_len - len(input_tokens))
        input_ids_batch.append(input_ids)
        token_type_ids_batch.append(token_type_ids)

    input_ids = np.array(input_ids_batch, dtype=np.int32)
    token_type_ids = np.array(token_type_ids_batch, dtype=np.int32)

    print("   tokens:", input_tokens)
    print("input_ids:{}/{}:{}".format(len(input_tokens), max_seq_len, input_ids),
          input_ids.shape, token_type_ids)

    model = CompareBertActivationsTest.load_keras_model(model_dir, max_seq_len)
    model.compile(optimizer=keras.optimizers.Adam(),
                  loss=keras.losses.mean_squared_error)

    pres = model.predict([input_ids, token_type_ids])  # just for fetching the shape of the output
    print("pres:", pres.shape)

    model.fit(x=(input_ids, token_type_ids),
              y=np.zeros_like(pres),
              batch_size=2,
              epochs=2)
def __init__(self, args, logger=None):
    self.args = args
    self.name = 'BERT'
    self.logger = logger
    self.manual_seed = args.seed
    self.max_seq_length = args.max_seq_length
    self.datapath = args.datapath
    self.bert_model_file = os.path.join(self.datapath, 'pretrained_models/bert/')
    self.vocab_file = os.path.join(self.bert_model_file, 'vocab.txt')
    self.lower_case = True
    self.learning_rate = args.learning_rate
    self.finetuning_rate = args.finetuning_rate
    self.model_dir = args.logdir
    self.tokenizer = bert_tokenization.FullTokenizer(
        vocab_file=self.vocab_file, do_lower_case=self.lower_case)
    self.num_supervised_trials = args.num_supervised_trials
    self.sup_batch_size = args.train_batch_size
    self.sup_epochs = args.num_epochs
    self.unsup_epochs = args.num_unsup_epochs
    self.T = args.T
def __init__(self):
    max_seq_len = 128
    title_col = 'title'
    text_col = 'text'
    label_col = 'type'
    model_dir = path.join(path.dirname(path.abspath(__file__)), 'uncased_L-12_H-768_A-12')

    print('loading bert data...')
    train_data = pd.read_csv("../CNN_data/all_data.csv")
    tokenizer = bert_tokenization.FullTokenizer(vocab_file=path.join(model_dir, "vocab.txt"))

    input_tokens = []
    input_labels = []
    print('tokenizing bert data...')
    for _, row in train_data.iterrows():
        text, title, label = row[text_col], row[title_col], row[label_col]
        total_text = text + title
        tokens = tokenizer.tokenize(total_text)
        tokens = ["[CLS]"] + tokens + ["[SEP]"]
        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        token_ids = token_ids[:min(len(token_ids), max_seq_len)]
        token_ids = token_ids + [0] * (max_seq_len - len(token_ids))
        input_tokens.append(token_ids)
        input_labels.append(label)
    print('loaded and processed bert data!')

    doubles = list(zip(input_tokens, input_labels))
    np.random.shuffle(doubles)
    input_tokens, input_labels = zip(*doubles)
    print('shuffled bert data')

    self.train_data = np.array(input_tokens[:int(len(input_tokens) * 0.75)])
    self.test_data = np.array(input_tokens[int(len(input_tokens) * 0.75):])
    self.train_labels = np.array(input_labels[:int(len(input_labels) * 0.75)])
    self.test_labels = np.array(input_labels[int(len(input_labels) * 0.75):])
    self.max_seq_len = max_seq_len
                y.append(int(label))
                pbar.update()
        return np.array(x), np.array(y)

    def _pad(self, ids):
        x, t = [], []
        token_type_ids = [0] * self.max_seq_len
        for input_ids in ids:
            input_ids = input_ids[:min(len(input_ids), self.max_seq_len - 2)]
            input_ids = input_ids + [0] * (self.max_seq_len - len(input_ids))
            x.append(np.array(input_ids))
            t.append(token_type_ids)
        return np.array(x), np.array(t)


tokenizer = bert_tokenization.FullTokenizer(
    vocab_file=os.path.join(bert_ckpt_dir, "vocab.txt"))
data = MovieReviewData(tokenizer,
                       sample_size=10 * 128 * 2,  # 5000
                       max_seq_len=128)

# EDA for training data
print("            train_x", data.train_x.shape)
print("train_x_token_types", data.train_x_token_types.shape)
print("            train_y", data.train_y.shape)
print("             test_x", data.test_x.shape)
print("        max_seq_len", data.max_seq_len)
# prepare class encoder
le = ce.OneHotEncoder(return_df=False, handle_unknown="ignore")
# labels = le.fit(list(df['id']))

mapa = [0, 1]
labels_map = [0, 1]
# i = 0
# for a in mapa:
#     labels_map.append(a)
# print(labels_map)

# Tokenization
# Initialize the tokenizer
from bert import bert_tokenization

tokenizer = bert_tokenization.FullTokenizer(vocab_path, do_lower_case=True)
# tokenizer = tokenization.FullTokenizer(vocab_path, do_lower_case=True)

# indices_train = []
indices_test = []
# for text in train['Desc']:
#     tk = tokenizer.tokenize(text)
#     tokens = ["[CLS]"] + tk + ["[SEP]"]
#     token_ids = tokenizer.convert_tokens_to_ids(tokens)
#     token_ids = _pad(token_ids, SEQ_LEN)
#     indices_train.append(token_ids)
for text in test['Desc']:
    tk = tokenizer.tokenize(text)
    tokens = ["[CLS]"] + tk + ["[SEP]"]
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
import pandas as pd
import numpy as np
import tensorflow_hub as hub
from bert import bert_tokenization
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils import class_weight

base_model = hub.load('https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1')
bert_layer = hub.KerasLayer(base_model)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

MAX_SEQ_LEN = 65
TEST_SIZE = 0.2
LR = 1e-4
N_EPOCHS = 5
BATCH_SIZE = 32

df = pd.read_csv('./data/clean_train.csv', index_col=False)
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

print("Split Data")
X_data = df['text'][:5].to_numpy()
y_data = df['target'][:5].to_numpy()
y_data = y_data.reshape(-1, 1)
def load_dataset(dataset_path, model_path, vocab_file, max_seq_len=None, return_len=False):
    df = pd.read_csv(dataset_path, sep='\t', compression='infer', header=None, index_col=None)
    df.columns = ['pmid', 'paragraph', 'sentence', 'in_sent_id', 'entity1', 'entity2',
                  'old_sent', 'class', 'distance', 'sample_sentence']
    sentences = list(df['sample_sentence'])
    labels = list(df['class'])

    vocab_path = model_path + vocab_file
    tokenizer = bert_tokenization.FullTokenizer(vocab_path, False)
    i_ent_tag = re.compile('<I>')
    o_ent_tag = re.compile('<O>')

    sentences_tokens = list()   # [[1,2,3], [5,2,3]]
    entity_position = list()    # [[(1,2), (2,3)]]
    for sent in sentences:
        bert_tokens = list()
        bert_target_indices = list()
        split_sent = sent.split('<S>')
        bert_tokens.append('[CLS]')
        for split in split_sent:
            if i_ent_tag.findall(split):
                start = len(bert_tokens)
                cur_split = i_ent_tag.sub('', split)
                word_pieces = tokenizer.tokenize(cur_split)
                bert_tokens.extend(word_pieces)
                end = len(bert_tokens)
                bert_target_indices.append([start, end])
            elif o_ent_tag.findall(split):
                cur_split = o_ent_tag.sub('', split)
                word_pieces = tokenizer.tokenize(cur_split)
                bert_tokens.extend(word_pieces)
            else:
                cur_split = split
                word_pieces = tokenizer.tokenize(cur_split)
                bert_tokens.extend(word_pieces)
        bert_tokens.append('[SEP]')
        sample_ids = tokenizer.convert_tokens_to_ids(bert_tokens)
        sentences_tokens.append(sample_ids)
        bert_target_indices.sort()
        entity_position.append(bert_target_indices)

    if max_seq_len is not None:
        all_len = np.array([len(item) for item in sentences_tokens])
        is_shorter_than_max = all_len <= max_seq_len
        x = np.array(pad_sequences(sequences=sentences_tokens, maxlen=max_seq_len,
                                   padding="post"))[is_shorter_than_max, :]
        ent1_position = np.array([pair[0] for pair in entity_position])[is_shorter_than_max, :]
        ent2_position = np.array([pair[1] for pair in entity_position])[is_shorter_than_max, :]
        y = np.array(labels)[is_shorter_than_max]
        df = df.loc[is_shorter_than_max, :]
        if return_len:
            return df, (x, ent1_position, ent2_position, all_len[is_shorter_than_max]), y
        else:
            return df, (x, ent1_position, ent2_position), y
    else:
        x = np.array(sentences_tokens)
        ent1_position = np.array([pair[0] for pair in entity_position])  # start-end for slicing
        ent2_position = np.array([pair[1] for pair in entity_position])
        y = np.array(labels)
        return df, (x, ent1_position, ent2_position), y
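# Hypothetical call to load_dataset() above; the TSV and model directory names
# are placeholders, not files from the original project.
df, (x, ent1_pos, ent2_pos), y = load_dataset(
    dataset_path="relation_samples.tsv",
    model_path="multi_cased_L-12_H-768_A-12/",
    vocab_file="vocab.txt",
    max_seq_len=128)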
def __init__(self, vocab_file, max_seq_len):
    self.vocab_file = vocab_file
    self.max_seq_len = max_seq_len
    self.tokenizer = bert_tokenization.FullTokenizer(vocab_file)
def get_BERT_Tokenizer():
    path = os.getcwd()[:os.getcwd().rfind('/')] + '/deeplearning/'
    vocab_file = path + 'uncased_L-12_H-768_A-12' + '/vocab.txt'
    tokenizer = bert_tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
    return tokenizer