def __init__(self, embedding_dir, model_name="bert-base-multilingual-cased", layer=-2):
    """Set up a BERT-backed sentence encoder.

    :param embedding_dir: directory handed through to the parent encoder class
    :param model_name: pre-trained BERT checkpoint to load
    :param layer: index of the hidden layer used for embeddings
                  (default -2, i.e. the second-to-last layer)
    """
    super(BertEncoder, self).__init__(embedding_dir)
    # Word-piece tokenizer matching the chosen checkpoint.
    self.tokenizer = BertTokenizer.from_pretrained(model_name)
    # Pre-trained weights; eval() disables dropout so encoding is deterministic.
    self.model = BertModel.from_pretrained(model_name)
    self.model.eval()
    # Which hidden layer the embeddings are taken from.
    self.layer = layer
# Build the vocabulary (and optional pre-trained embedding matrix) for the dataset.
vocab, embd = data_helpers.build_vocab(FLAGS.dataset, FLAGS.pretrained_embeddings_path)
if len(FLAGS.pretrained_embeddings_path) > 0:
    # Pre-trained vectors must match the configured embedding size.
    assert (embd.shape[1] == FLAGS.embedding_dim)
# Cache embeddings and vocab so later runs can skip the build step.
with open('{}/embd.pkl'.format(FLAGS.dataset), 'wb') as fout:
    pickle.dump(embd, fout)
with open('{}/vocab.pkl'.format(FLAGS.dataset), 'wb') as fout:
    pickle.dump(vocab, fout)
alist = data_helpers.read_alist_standalone(FLAGS.dataset, "vocab.txt",
                                           FLAGS.max_sequence_length_a, FLAGS.padding)
raw, raw_dict = data_helpers.read_raw_bert(FLAGS.dataset)
# Dev / test splits; NOTE(review): testallList deliberately reloads the same
# "test.data" file — confirm this is intended rather than a separate file.
devList = data_helpers.loadTestSet(FLAGS.dataset, "valid.data")
testList = data_helpers.loadTestSet(FLAGS.dataset, "test.data")
testallList = data_helpers.loadTestSet(FLAGS.dataset, "test.data")  # testall
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
print("Load done...")
# Per-run log files for precision and loss curves.
if not os.path.exists('./log/'):
    os.mkdir('./log/')
log_precision = 'log/{}.test.gan_precision.{}.log'.format(FLAGS.prefix, timeStamp)
log_loss = 'log/{}.test.gan_loss.{}.log'.format(FLAGS.prefix, timeStamp)


def log_time_delta(func):
    # Decorator that measures a call's wall-clock duration.
    # NOTE(review): this chunk is truncated here — the remainder of _deco
    # (presumably logging the delta and returning ret) is not visible.
    @wraps(func)
    def _deco(*args, **kwargs):
        start = time.time()
        ret = func(*args, **kwargs)
        end = time.time()
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Read the training file: one "<category>\t<sentence>" pair per line.
with open('train.txt', mode='r', encoding='utf-8') as file:
    sentence = list()
    raw_sentence = list()
    category = list()
    while True:
        data = file.readline()
        if data:
            category.append(data.split('\t')[0])
            sentence.append(data.split('\t')[1])
            raw_sentence.append(data.split('\t')[1])
        else:
            # readline() returns '' at EOF.
            break

tokenized_input_text = list()
indexed_input_text = list()
tokenized_output_text = list()
indexed_output_text = list()
# Wrap both the sentence and its category in BERT's special tokens.
# NOTE(review): this chunk is truncated — the loop body continues past
# this view (presumably tokenization and id conversion).
for (sentence1, sentence2) in zip(sentence, category):
    sentence1 = "[CLS] " + sentence1 + " [SEP]"
    sentence2 = "[CLS] " + sentence2 + " [SEP]"
import numpy as np
import pandas as pd
from pprint import pprint
from typing import Tuple, Callable, List
import pickle
import json
from tqdm import tqdm
from collections import OrderedDict
import re
from pathlib import Path
from pytorch_pretrained_bert import BertTokenizer
from vocab import Vocabulary
from pad_sequence import keras_pad_fn, my_pad

# Module-level tokenizer shared as the default for the dataset below.
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")


# NOTE(review): `Dataset` is presumably torch.utils.data.Dataset imported
# elsewhere in the file — confirm.
class NamedEntityRecognitionDataset(Dataset):
    def __init__(self, train_data_dir: str, vocab, tokenizer=bert_tokenizer,
                 maxlen=30, model_dir=Path('data_in')) -> None:
        """NER dataset over a directory of training files.

        :param train_data_dir: directory containing the training data
        :param vocab: vocabulary used to index tokens
        :param tokenizer: word-piece tokenizer (defaults to multilingual BERT)
        :param maxlen: maximum sequence length
        :param model_dir: directory holding model resources
        """
        # NOTE(review): this chunk is truncated — the rest of __init__ is
        # not visible in this view.
        self.model_dir = model_dir
# Evaluate the classifier on the cnews test split.
model.to(device)
test_data = pd.read_table('../cnews/cnews.test.txt', encoding='utf-8', names=['label', 'text'])
le = LabelEncoder()
le.fit(test_data.label.tolist())
# Map label strings to integer ids.
test_data['label_id'] = le.transform(test_data.label.tolist())
# Keep a label <-> id mapping table for later decoding of predictions.
labels_data = test_data.groupby(['label', 'label_id']).count().reset_index()
labels_map = labels_data[['label', 'label_id']]
test_data = test_data[['text', 'label_id']]
# Word-piece tokenizer loaded from a local Chinese whole-word-masking checkpoint.
bert_tokenizer = BertTokenizer.from_pretrained('../chinese_wwm_ext_pytorch', do_lower_case=False)
# Preprocessor turning raw text into BERT inputs (ids / masks / segments).
processor = DataPrecessForSingleSentence(bert_tokenizer=bert_tokenizer)
test_seqs, test_seq_masks, test_seq_segments, test_labels = processor.get_input(dataset=test_data)
# Convert everything to LongTensors for the DataLoader.
test_seqs = torch.tensor(test_seqs, dtype=torch.long)
test_seq_masks = torch.tensor(test_seq_masks, dtype=torch.long)
test_seq_segments = torch.tensor(test_seq_segments, dtype=torch.long)
test_labels = torch.tensor(test_labels, dtype=torch.long)
test_data = TensorDataset(test_seqs, test_seq_masks, test_seq_segments, test_labels)
test_dataloder = DataLoader(dataset=test_data, batch_size=192)
# Buffers for true and predicted labels collected during evaluation.
true_labels = []
pred_labels = []
model.eval()
def preprocessor(
    data_dir,
    task_name,
    split,
    bert_model_name="bert-base-uncased",
    max_data_samples=None,
    max_sequence_length=128,
):
    """Tokenize one TSV split into BERT-ready inputs.

    :param data_dir: root directory of the task data
    :param task_name: GLUE-style task identifier
    :param split: which split to load (train/dev/test)
    :param bert_model_name: checkpoint name; "uncased" triggers lower-casing
    :param max_data_samples: optional cap on the number of examples
    :param max_sequence_length: hard cap including special tokens
    :returns: (token_ids, token_segments, token_masks, labels) — the first
              three are lists of per-example LongTensors, labels is a tensor.
    """
    sentences, labels = parse_tsv(data_dir, task_name, split, max_data_samples)
    labels = torch.from_numpy(np.array(labels))

    # "uncased" checkpoints require lower-casing at tokenization time.
    tokenizer = BertTokenizer.from_pretrained(
        bert_model_name, do_lower_case="uncased" in bert_model_name
    )

    all_ids, all_masks, all_segments = [], [], []
    for sentence in sentences:
        if len(sentence) not in [1, 2]:
            logger.error("Sentence length doesn't match.")

        pieces = [tokenizer.tokenize(sent) for sent in sentence]
        first = pieces[0]
        second = pieces[1] if len(pieces) == 2 else None

        if second is None:
            # Single sentence: leave room for [CLS] and [SEP].
            first = first[: max_sequence_length - 2]
        else:
            # Sentence pair: leave room for [CLS], [SEP], [SEP]; trim the
            # longer side one word-piece at a time until the pair fits.
            while len(first) + len(second) > max_sequence_length - 3:
                if len(first) > len(second):
                    first.pop()
                else:
                    second.pop()

        tokens = ["[CLS]"] + first + ["[SEP]"]
        segments = [0] * len(tokens)
        if second:
            tokens += second + ["[SEP]"]
            segments += [1] * (len(second) + 1)

        ids = tokenizer.convert_tokens_to_ids(tokens)
        all_ids.append(torch.LongTensor(ids))
        # 1 marks a real token; padding added downstream would be 0.
        all_masks.append(torch.LongTensor([1] * len(ids)))
        all_segments.append(torch.LongTensor(segments))

    return all_ids, all_segments, all_masks, labels
def main():
    """Fine-tune BERT on the toxicity data and score the held-out slice.

    Reads TRAIN_PATH, builds identity-aware sample weights, trains one epoch
    with apex mixed precision, then computes the bias metrics on the tail
    slice and writes oof.csv plus a loss curve (loss.png).

    NOTE(review): relies on module-level names defined elsewhere in the file
    (train_size, valid_size, seed, identity_columns, AUX_COLUMNS, batch_size,
    epochs, accumulation_steps, max_len, n_labels, exp, device, ...).
    """
    # train_df = pd.read_csv(TRAIN_PATH).sample(frac=1.0, random_state=seed)
    # train_size = int(len(train_df) * 0.9)
    train_df = pd.read_csv(TRAIN_PATH).sample(train_size + valid_size, random_state=seed)
    LOGGER.info(f'data_size is {len(train_df)}')
    LOGGER.info(f'train_size is {train_size}')
    # Binarize the main target at 0.5; auxiliary targets stay as floats.
    y = np.where(train_df['target'] >= 0.5, 1, 0)
    y_aux = train_df[AUX_COLUMNS].values
    identity_columns_new = []
    for column in identity_columns + ['target']:
        train_df[column + "_bin"] = np.where(train_df[column] >= 0.5, True, False)
        if column != "target":
            identity_columns_new.append(column + "_bin")
    # Sample weights: up-weight rows flagged in identity columns, toxic rows
    # with no identity flags, and (x5) non-toxic rows WITH identity flags;
    # finally normalize to mean 1.
    sample_weights = np.ones(len(train_df), dtype=np.float32)
    sample_weights += train_df[identity_columns_new].sum(axis=1)
    sample_weights += train_df['target_bin'] * (~train_df[identity_columns_new]).sum(axis=1)
    sample_weights += (~train_df['target_bin']) * train_df[identity_columns_new].sum(axis=1) * 5
    sample_weights /= sample_weights.mean()
    with timer('preprocessing text'):
        # df["comment_text"] = [analyzer_embed(text) for text in df["comment_text"]]
        train_df['comment_text'] = train_df['comment_text'].astype(str)
        train_df = train_df.fillna(0)
    with timer('load embedding'):
        tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None, do_lower_case=True)
        X_text = convert_lines(train_df["comment_text"].fillna("DUMMY_VALUE"), max_len, tokenizer)
    # The tail of the sampled frame serves as the evaluation slice.
    test_df = train_df[train_size:]
    with timer('train'):
        X_train, y_train, y_aux_train, w_train = X_text[:train_size], y[:train_size], y_aux[:train_size], sample_weights[:train_size]
        X_val, y_val, y_aux_val, w_val = X_text[train_size:], y[train_size:], y_aux[train_size:], sample_weights[train_size:]
        model = BertForSequenceClassification.from_pretrained(WORK_DIR, cache_dir=None, num_labels=n_labels)
        model.zero_grad()
        model = model.to(device)
        train_dataset = torch.utils.data.TensorDataset(
            torch.tensor(X_train, dtype=torch.long),
            torch.tensor(y_train, dtype=torch.float))
        valid = torch.utils.data.TensorDataset(
            torch.tensor(X_val, dtype=torch.long),
            torch.tensor(y_val, dtype=torch.float))
        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size * 2, shuffle=False)
        sample_weight_train = [w_train.values, np.ones_like(w_train)]
        sample_weight_val = [w_val.values, np.ones_like(w_val)]
        # BERT convention: no weight decay on biases and LayerNorm parameters.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01
        }, {
            'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]
        num_train_optimization_steps = int(epochs * train_size / batch_size / accumulation_steps)
        total_step = int(epochs * train_size / batch_size)
        optimizer = BertAdam(optimizer_grouped_parameters, lr=2e-5, warmup=0.05,
                             t_total=num_train_optimization_steps)
        # Mixed-precision training via NVIDIA apex.
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)
        criterion = torch.nn.BCEWithLogitsLoss().to(device)
        LOGGER.info(f"Starting 1 epoch...")
        tr_loss, train_losses = train_one_epoch(model, train_loader, criterion, optimizer,
                                                device, accumulation_steps, total_step, n_labels)
        LOGGER.info(f'Mean train loss: {round(tr_loss,5)}')
        torch.save(model.state_dict(), '{}_dic'.format(exp))
        valid_loss, oof_pred = validate(model, valid_loader, criterion, device, n_labels)
        # Free GPU memory before the metric computation below.
        del model
        gc.collect()
        torch.cuda.empty_cache()
    test_df["pred"] = oof_pred.reshape(-1)
    test_df = convert_dataframe_to_bool(test_df)
    bias_metrics_df = compute_bias_metrics_for_model(test_df, identity_columns)
    LOGGER.info(bias_metrics_df)
    score = get_final_metric(bias_metrics_df, calculate_overall_auc(test_df))
    LOGGER.info(f'final score is {score}')
    test_df.to_csv("oof.csv", index=False)
    # Plot the per-iteration training loss curve.
    xs = list(range(1, len(train_losses) + 1))
    plt.plot(xs, train_losses, label='Train loss')
    plt.legend()
    plt.xticks(xs)
    plt.xlabel('Iter')
    plt.savefig("loss.png")
def prewin(path):
    """Build PreWin-style windowed features (words, dependency labels and
    BERT embeddings) around a target entity for every sentence in *path*,
    then dump the result to an HDF5 file next to the input.

    The class label is inferred from the input file name.

    :param path: input file; one "ENTITY<SEP>...<ENT>ENTITY<ENT>..." line
                 per sentence (see the format note below).
    """
    dirname = os.path.dirname(path)
    name = os.path.basename(path)
    rawname = os.path.splitext(name)[0]  # without extension
    # Label from the file name. NOTE(review): if the name matches none of
    # these patterns, `label` stays unbound and out.append() below raises
    # NameError — confirm all expected file names are covered.
    if 'lit' in name or 'literal' in name or 'LOCATION' in name:
        label = 0
    else:
        if 'met' in name or 'metonymic' in name or 'mixed' in name:
            label = 1  # 1 is for METONYMY/NON-LITERAL, 0 is for LITERAL
        elif 'INSTITUTE' in name:
            label = 1
        elif 'TEAM' in name:
            label = 2
        elif 'ARTIFACT' in name:
            label = 3
        elif 'EVENT' in name:
            label = 4
    bert_version = 'bert-base-uncased'
    model = BertModel.from_pretrained(bert_version)
    model.eval()
    spacy_tokenizer = English(parser=False)
    bert_tokenizer = BertTokenizer.from_pretrained(bert_version)
    en_nlp = spacy.load('en')
    inp = codecs.open(path, mode="r", encoding="utf-8")
    # PLEASE FORMAT THE INPUT FILE AS ONE SENTENCE PER LINE. SEE BELOW:
    # ENTITY<SEP>sentence<ENT>ENTITY<ENT>rest of sentence.
    # Germany<SEP>Their privileges as permanent Security Council members, especially the right of veto,
    # had been increasingly questioned by <ENT>Germany<ENT> and Japan which, as major economic powers.
    out = []
    seq_length = 5  # A window of 5 is the DEFAULT for the PUBLICATION methodology. Feel free to experiment.
    for line in inp:
        line = line.split(u"<SEP>")
        sentence = line[1].split(u"<ENT>")
        entity = [t.text for t in spacy_tokenizer(sentence[1])]
        en_doc = en_nlp(u"".join(sentence).strip())
        words = []
        # Index of the entity's head token within the spacy parse.
        index = locate_entity(en_doc, entity, spacy_tokenizer(sentence[0].strip()),
                              spacy_tokenizer(sentence[2].strip()))
        start = find_start(en_doc[index])
        # --------------------------------------------------------------------
        # Token map will be an int -> int mapping
        # between the `spacy_tokens` index and the `bert_tokens` index.
        spacy_to_bert_map = []
        bert_tokens = []
        spacy_tokens = [token.text for token in en_doc]
        '''
        According to https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/
        [CLS] amd [SEP] tokens are important. Also, use the segment_ids to inform
        BERT that the input is just one sentence.
        '''
        spacy_tokens = ["[CLS]"] + spacy_tokens + ["[SEP]"]
        for orig_token in spacy_tokens:
            # Each spacy token maps to the index of its FIRST word-piece.
            spacy_to_bert_map.append(len(bert_tokens))
            bert_tokens.extend(bert_tokenizer.tokenize(orig_token))
        segments_ids = [1] * len(bert_tokens)
        try:
            token_ids = bert_tokenizer.convert_tokens_to_ids(bert_tokens)
            tokens_tensor = torch.tensor([token_ids])
            segments_tensors = torch.tensor([segments_ids])
            with torch.no_grad():
                encoded_layers, _ = model(tokens_tensor, segments_tensors,
                                          output_all_encoded_layers=True)
            '''
            According to http://jalammar.github.io/illustrated-bert/
            concatenating the last four hidden four layers is a good choice as a
            contextualised ELMo-like word embeddings. Concatenation leads to very
            long tensors. So I decided to take sum of the last four hiddden layers.
            This is the second best approach according to the blog.
            '''
            bert_emb = torch.add(encoded_layers[-1], encoded_layers[-2]).add(
                encoded_layers[-3]).add(encoded_layers[-4]).squeeze()
            bert_emb_length = bert_emb.shape[-1]
            '''
            Perform summation of subword embeddings to compute word embeddings
            Another choice is to compute the average of the subword embeddings.
            Concatenation is obviously not a good choice here.
            Source: https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/
            Here, we perform summation of subword embeddings.
            '''
            # Condense word-piece embeddings back to one vector per spacy token.
            cond_bert_emb = torch.zeros(len(spacy_tokens), bert_emb_length)
            for spacy_index in range(len(spacy_tokens)):
                start_bert_index = spacy_to_bert_map[spacy_index]
                try:
                    end_bert_index = spacy_to_bert_map[spacy_index + 1]
                except IndexError:
                    # Last token: its pieces run to the end of bert_tokens.
                    end_bert_index = len(bert_tokens)
                for foo in range(start_bert_index, end_bert_index):
                    cond_bert_emb[spacy_index] = cond_bert_emb[spacy_index].add(bert_emb[foo])
        except ValueError:
            # Fall back to zero vectors (768 = bert-base hidden size).
            cond_bert_emb = torch.zeros(len(spacy_tokens), 768)
            print('ValueError Exception caught!')
        '''
        Since the two special tokens are added, strip bert embeddings
        appropriately. Now bert embeddings are in sync in spacy parse.
        '''
        cond_bert_emb = cond_bert_emb[1:-1]
        assert (len(cond_bert_emb) == len(en_doc))
        # --------------------------------------------------------------------
        # Default (all-padding) windows on both sides of the entity.
        left = seq_length * ["0.0"]
        right = seq_length * ["0.0"]
        dep_left = seq_length * ["0.0"]
        dep_right = seq_length * ["0.0"]
        bert_left = torch.zeros((seq_length, bert_emb_length))
        bert_right = torch.zeros_like(bert_left)
        if start.i > index:
            # Head is to the RIGHT of the entity: build the right-hand window.
            if index + 1 < len(en_doc) and en_doc[index + 1].dep_ in [u"case", u"compound", u"amod"] \
                    and en_doc[index + 1].head == en_doc[index]:
                # any neighbouring word that links to it
                right = pad(
                    [en_doc[index + 1].text] + [t.text for t in en_doc[start.i:][:seq_length - 1]],
                    False, seq_length)
                dep_right = pad([en_doc[index + 1].dep_] + [t.dep_ for t in en_doc[start.i:]][:seq_length - 1],
                                False, seq_length)
                bert_right = bert_pad(
                    torch.cat((torch.unsqueeze(cond_bert_emb[index + 1], 0),
                               cond_bert_emb[start.i:][:seq_length - 1])), False, seq_length)
            else:
                right = pad([t.text for t in en_doc[start.i:][:seq_length]], False, seq_length)
                dep_right = pad([t.dep_ for t in en_doc[start.i:]][:seq_length], False, seq_length)
                bert_right = bert_pad(cond_bert_emb[start.i:][:seq_length], False, seq_length)
        else:
            # Head is to the LEFT of the entity: build the left-hand window.
            if index - len(entity) >= 0 and en_doc[index - len(entity)].dep_ in [u"case", u"compound", u"amod"] \
                    and en_doc[index - len(entity)].head == en_doc[index]:
                # any neighbouring word that links to it
                left = pad(
                    [t.text for t in en_doc[:start.i + 1][-(seq_length - 1):]] + [en_doc[index - len(entity)].text],
                    True, seq_length)
                dep_left = pad(
                    [t.dep_ for t in en_doc[:start.i + 1]][-(seq_length - 1):] + [en_doc[index - len(entity)].dep_],
                    True, seq_length)
                bert_left = bert_pad(
                    torch.cat(
                        (cond_bert_emb[:start.i + 1][-(seq_length - 1):],
                         torch.unsqueeze(cond_bert_emb[index - len(entity)], 0))), True, seq_length)
            else:
                left = pad([t.text for t in en_doc[:start.i + 1][-seq_length:]], True, seq_length)
                dep_left = pad([t.dep_ for t in en_doc[:start.i + 1]][-seq_length:], True, seq_length)
                bert_left = bert_pad(cond_bert_emb[:start.i + 1][-seq_length:], True, seq_length)
        assert (bert_left.shape == bert_right.shape)
        assert (len(left) == len(dep_left) == len(bert_left))
        assert (len(right) == len(dep_right) == len(bert_right))
        out.append((left, dep_left, bert_left, right, dep_right, bert_right, label))
        #print(left, right)
        #print(dep_left, dep_right)
        #print(bert_left, bert_right)
        #print(label)
        #print(line[1])
    print("Processed:{} lines/sentences.".format(len(out)))
    dump_to_hdf5("{}/bert_pickles/{}_prewin.hdf5".format(dirname, rawname), out)
def __init__(self, model_file='https://convlab.blob.core.windows.net/convlab-2/comer.zip',
             embed_file='https://convlab.blob.core.windows.net/convlab-2/comer_embed.zip'):
    """Build the COMER dialog-state tracker.

    Downloads and unzips the checkpoint / embedding archives on first use,
    loads the vocab, a BERT encoder and the seq2seq model, then restores the
    trained weights and starts a session.

    :param model_file: URL (or path) of the zipped model checkpoint archive
    :param embed_file: URL (or path) of the zipped target-embedding archive
    """
    super().__init__()
    # argparse is used only as a defaults container; parse_args([]) below
    # never touches the real command line.
    parser = argparse.ArgumentParser(description='predict.py')
    parser.add_argument('-config', default='config.yaml', type=str, help="config file")
    parser.add_argument('-gpus', default=[0], nargs='+', type=int, help="Use CUDA on the listed devices.")
    parser.add_argument('-restore', default='data/log/norml_mwNestedNOND128NOsaA1FNN/checkpoint.pt',
                        type=str, help="restore checkpoint")
    parser.add_argument('-seed', type=int, default=1234, help="Random seed")
    parser.add_argument('-model', default='seq2seq', type=str, help="Model selection")
    parser.add_argument('-score', default='', type=str, help="score_fn")
    parser.add_argument('-pretrain', action='store_true', help="load pretrain embedding")
    parser.add_argument('-limit', type=int, default=0, help="data limit")
    parser.add_argument('-log', default='predict', type=str, help="log directory")
    parser.add_argument('-unk', action='store_true', help="replace unk")
    parser.add_argument('-memory', action='store_true', help="memory efficiency")
    parser.add_argument('-beam_size', type=int, default=1, help="beam search size")
    self.root_path = os.path.dirname(os.path.abspath(__file__))
    opt = parser.parse_args([])
    config = utils.read_config(os.path.join(self.root_path, opt.config))
    torch.manual_seed(opt.seed)
    use_cuda = True
    bert_type = 'bert-large-uncased'
    # Fetch + extract the checkpoint archive unless every artifact already exists.
    if os.path.exists(os.path.join(self.root_path, 'data/mwoz2_dm.dict')) and \
            os.path.exists(os.path.join(self.root_path, 'data/mwoz2_sl.dict')) and \
            os.path.exists(os.path.join(self.root_path, 'data/save_data.tgt.dict')) and \
            os.path.exists(os.path.join(self.root_path, 'data/log/norml_mwNestedNOND128NOsaA1FNN/checkpoint.pt')):
        pass
    else:
        output_dir = os.path.join(self.root_path, 'data')
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        print('Load from model_file param')
        archive_file = cached_path(model_file)
        archive = zipfile.ZipFile(archive_file, 'r')
        archive.extractall(self.root_path)
        archive.close()
    # Same for the pre-trained target embeddings.
    if not os.path.exists(os.path.join(self.root_path, 'data/emb_tgt_mw.pt')):
        output_dir = os.path.join(self.root_path, 'data')
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        print('Load from embed_file param')
        archive_file = cached_path(embed_file)
        archive = zipfile.ZipFile(archive_file, 'r')
        archive.extractall(self.root_path)
        archive.close()
    self.sl_dict = json.load(open(os.path.join(self.root_path, 'data/mwoz2_sl.dict')))
    self.dm_dict = json.load(open(os.path.join(self.root_path, 'data/mwoz2_dm.dict')))
    self.tokenizer = BertTokenizer.from_pretrained(bert_type)
    self.vocab = torch.load(os.path.join(self.root_path, 'data/save_data.tgt.dict'))
    # id -> token lookup used when decoding model output.
    self.reversed_vocab = {i: j for j, i in self.vocab.items()}
    pretrain_embed = {}
    pretrain_embed['slot'] = torch.load(os.path.join(self.root_path, 'data/emb_tgt_mw.pt'))
    print('building model...\n')
    # BERT encoder in eval mode, fed into the seq2seq state tracker.
    bmodel = BertModel.from_pretrained(bert_type)
    bmodel.eval()
    if use_cuda:
        bmodel.to('cuda')
    self.model = getattr(models, opt.model)(config, self.vocab, self.vocab, use_cuda, bmodel,
                                            pretrain=pretrain_embed, score_fn=opt.score)
    print('loading checkpoint...\n')
    print(os.path.join(self.root_path, opt.restore))
    import sys
    sys.path.append(self.root_path)
    checkpoints = torch.load(os.path.join(self.root_path, opt.restore))
    self.model.load_state_dict(checkpoints['model'])
    self.model.cuda()
    self.model.eval()
    self.state = None
    self.init_session()
# Training configuration for Chinese BERT fine-tuning with fastai.
config = Config(
    testing=False,
    bert_model_name="bert-base-chinese",  # Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M parameters
    max_lr=3e-5,  # learning rate
    epochs=5,
    use_fp16=False,  # fastai can switch precision cheaply to speed up training: learner.to_fp16()
    bs=8,  # batch size
    max_seq_len=128,  # pick a suitable seq_length; large values may make training very slow or crash
)

from pytorch_pretrained_bert import BertTokenizer

bert_tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")


# Adapter that plugs the BERT word-piece tokenizer into fastai.
class FastAiBertTokenizerAdapter(BaseTokenizer):
    """Wrap a BertTokenizer as a fastai BaseTokenizer."""

    def __init__(self, tokenizer: BertTokenizer, max_seq_len: int = 128, **kwargs):
        # Underlying HuggingFace tokenizer and the truncation length.
        self._pretrained_tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __call__(self, *args, **kwargs):
        # fastai calls the tokenizer object itself per-process; returning
        # self lets the same instance be reused.
        return self
def __init__(self, df, train_mode=True, labeled=True):
    """Dataset over a dataframe of toxicity comments.

    :param df: source dataframe holding the comments (and labels when
               ``labeled`` is True)
    :param train_mode: whether the dataset is used for training
    :param labeled: whether label columns are present in ``df``
    """
    super(ToxicDataset, self).__init__()
    # Word-piece tokenizer for the uncased base BERT checkpoint.
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    # Keep the raw frame plus the mode flags for use by the item accessors.
    self.df = df
    self.train_mode = train_mode
    self.labeled = labeled
dev_data_file = open(path + 'en_ewt-ud-dev.conllu', 'r', encoding='utf-8') trainPos = joinParse(train_data_file) devPos = joinParse(dev_data_file) tagsTrain = list(set(word_pos[1] for sent in trainPos for word_pos in sent)) tag2idx = {tag: idx for idx, tag in enumerate(tagsTrain)} tag2idx['<pad>'] = -1 idx2tag = {idx: tag for idx, tag in enumerate(tagsTrain)} idx2tag[-1] = '<pad>' idx2tag[17] = '<pad>' device = 'cuda' if torch.cuda.is_available() else 'cpu' if args.large: tokenizer = BertTokenizer.from_pretrained( 'bert-large-cased', do_lower_case=False ) if case == 'case' else BertTokenizer.from_pretrained( 'bert-large-uncased') else: tokenizer = BertTokenizer.from_pretrained( 'bert-base-cased', do_lower_case=False ) if case == 'case' else BertTokenizer.from_pretrained('bert-base-uncased') model = Net(vocab_size=len(tag2idx), device=device, case=case) model.to(device) model = nn.DataParallel(model) train_dataset = PosDataset(trainPos) dev_dataset = PosDataset(devPos) for epoch in range(args.epochs):
def main(args):
    """Fine-tune BERT for binary sequence classification and save the result.

    Reads training examples via BinaryClassificationProcessor, converts them
    to BERT features (cached to train_features.pkl), trains with gradient
    accumulation and a class-weighted cross-entropy, then writes the model,
    config and vocabulary into OUTPUT_DIR.

    NOTE(review): depends on module-level constants (OUTPUT_DIR, DATA_DIR,
    BERT_MODEL, TRAIN_BATCH_SIZE, NUM_TRAIN_EPOCHS, GRADIENT_ACCUMULATION_STEPS,
    MAX_SEQ_LENGTH, OUTPUT_MODE, LEARNING_RATE, WARMUP_PROPORTION, device, ...)
    defined elsewhere in the file; `args` itself is unused here.
    """
    output_mode = OUTPUT_MODE
    cache_dir = CACHE_DIR
    # if os.path.exists(REPORTS_DIR) and os.listdir(REPORTS_DIR):
    #     REPORTS_DIR += '/report_%d' % (len(os.listdir(REPORTS_DIR)))
    #     os.makedirs(REPORTS_DIR)
    # if not os.path.exists(REPORTS_DIR):
    #     os.makedirs(REPORTS_DIR)
    #     REPORTS_DIR += '/report_%d' % (len(os.listdir(REPORTS_DIR)))
    #     os.makedirs(REPORTS_DIR)
    # Refuse to clobber a previous run's output.
    if os.path.exists(OUTPUT_DIR) and os.listdir(OUTPUT_DIR):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(OUTPUT_DIR))
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)
    processor = BinaryClassificationProcessor()
    train_examples = processor.get_train_examples(DATA_DIR)
    train_examples_len = len(train_examples)
    label_list = processor.get_labels()  # [0, 1] for binary classification
    num_labels = len(label_list)
    num_train_optimization_steps = int(
        train_examples_len / TRAIN_BATCH_SIZE / GRADIENT_ACCUMULATION_STEPS) * NUM_TRAIN_EPOCHS
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
    label_map = {label: i for i, label in enumerate(label_list)}
    train_examples_for_processing = [(example, label_map, MAX_SEQ_LENGTH, tokenizer, OUTPUT_MODE)
                                     for example in train_examples]
    train_features = []
    for train_example in train_examples_for_processing:
        train_features.append(
            convert_examples_to_features.convert_example_to_feature(train_example))
    # Cache the converted features so re-runs can skip the conversion.
    with open(DATA_DIR + "train_features.pkl", "wb") as f:
        pickle.dump(train_features, f)
    # Load pre-trained model (weights)
    model = BertForSequenceClassification.from_pretrained(
        BERT_MODEL, cache_dir=CACHE_DIR, num_labels=num_labels)
    model.to(device)
    # BERT convention: no weight decay on biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = BertAdam(optimizer_grouped_parameters, lr=LEARNING_RATE,
                         warmup=WARMUP_PROPORTION, t_total=num_train_optimization_steps)
    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", train_examples_len)
    logger.info(" Batch size = %d", TRAIN_BATCH_SIZE)
    logger.info(" Num steps = %d", num_train_optimization_steps)
    all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=TRAIN_BATCH_SIZE)
    model.train()
    for _ in trange(int(NUM_TRAIN_EPOCHS), desc="Epoch"):
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(
                tqdm_notebook(train_dataloader, desc="Iteration")):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            logits = model(input_ids, segment_ids, input_mask, labels=None)
            # Class weights [1.0, 2.0] up-weight the positive class.
            # NOTE(review): `volatile=True` on Variable is deprecated/no-op in
            # modern PyTorch, and this tensor could be built once outside the
            # loop — left unchanged here.
            label_weights = torch.FloatTensor(np.asarray([1.0, 2.0]))
            label_weights_tensor = torch.autograd.Variable(
                label_weights, volatile=True).cuda()
            loss_fct = CrossEntropyLoss(weight=label_weights_tensor)
            # loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
            # Scale the loss so accumulated gradients average correctly.
            if GRADIENT_ACCUMULATION_STEPS > 1:
                loss = loss / GRADIENT_ACCUMULATION_STEPS
            loss.backward()
            # print("\r%f" % loss)
            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            # Step the optimizer only every GRADIENT_ACCUMULATION_STEPS batches.
            if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
        print('Avg train loss:%.4f' % (tr_loss * 1.0 / nb_tr_steps))
    model_to_save = model.module if hasattr(
        model, 'module') else model  # Only save the model it-self
    # If we save using the predefined names, we can load using `from_pretrained`
    output_model_file = os.path.join(OUTPUT_DIR, WEIGHTS_NAME)
    output_config_file = os.path.join(OUTPUT_DIR, CONFIG_NAME)
    torch.save(model_to_save.state_dict(), output_model_file)
    model_to_save.config.to_json_file(output_config_file)
    tokenizer.save_vocabulary(OUTPUT_DIR)
def train(args, dataset, text_process, generator, discriminator, inception_score):
    """Progressive-growing text-to-image GAN training loop.

    Alternates discriminator and generator updates (every `n_critic` steps),
    growing the image resolution whenever `args.phase * 2` samples have been
    consumed; periodically saves sample grids and checkpoints.

    NOTE(review): relies on module-level names defined elsewhere in the file
    (g_optimizer, d_optimizer, t_optimizer, g_running, t_running, n_critic,
    code_size, sample_data, adjust_lr, requires_grad, accumulate,
    get_sentence, ...).
    """
    # Starting resolution: step s corresponds to 4 * 2**s pixels.
    step = int(math.log2(args.init_size)) - 2
    resolution = 4 * 2 ** step
    loader = sample_data(
        dataset, args.batch.get(resolution, args.batch_default), resolution
    )
    data_loader = iter(loader)
    adjust_lr(g_optimizer, args.lr.get(resolution, 0.0001))
    adjust_lr(d_optimizer, args.lr.get(resolution, 0.0004))
    pbar = tqdm(range(0, 300_000))
    # Phase 1 of each iteration trains only the discriminator.
    requires_grad(text_process, False)
    requires_grad(generator, False)
    requires_grad(discriminator, True)
    disc_loss_val = 0
    gen_loss_val = 0
    grad_loss_val = 0
    kl_loss_val = 0
    score = 0
    alpha = 0
    used_sample = 0
    max_step = int(math.log2(args.max_size)) - 2
    final_progress = False
    # fixed output: a constant caption batch used for periodic sample grids.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    _, fixed_caption = next(data_loader)
    fixed_gen_i, fixed_gen_j = (10, 5)
    fixed_caption = fixed_caption.cuda()
    # fixed_c_code, _, _, _, _ = text_process(fixed_caption)
    descriptions = get_sentence(tokenizer, fixed_caption[:fixed_gen_j])
    with open(os.path.join(args.out, 'fixed_sentence.txt'), 'w') as f:
        for item in descriptions:
            f.write("%s\n" % item)
    # Training iteration
    for i in pbar:
        discriminator.zero_grad()
        # Fade-in coefficient for the freshly-grown resolution block.
        alpha = min(1, 1 / args.phase * (used_sample + 1))
        # if (resolution == args.init_size and args.ckpt is None) or final_progress:
        if (resolution == args.init_size and args.ckpt is None) or final_progress:
            alpha = 1
        # Grow to the next resolution after consuming args.phase * 2 samples.
        if used_sample > args.phase * 2:
            used_sample = 0
            step += 1
            if step > max_step:
                step = max_step
                final_progress = True
                ckpt_step = step + 1
            else:
                alpha = 0
                ckpt_step = step
            resolution = 4 * 2 ** step
            loader = sample_data(
                dataset, args.batch.get(resolution, args.batch_default), resolution
            )
            data_loader = iter(loader)
            # Checkpoint everything at the resolution switch.
            torch.save(
                {
                    'text_process': text_process.module.state_dict(),
                    'generator': generator.module.state_dict(),
                    'discriminator': discriminator.module.state_dict(),
                    't_optimizer': t_optimizer.state_dict(),
                    'g_optimizer': g_optimizer.state_dict(),
                    'd_optimizer': d_optimizer.state_dict(),
                    'g_running': g_running.state_dict(),
                    't_running': t_running.state_dict(),
                },
                os.path.join(args.out, f'checkpoint/train_step-{ckpt_step}.model')
            )
            adjust_lr(t_optimizer, args.lr.get(resolution, 0.0001))
            adjust_lr(g_optimizer, args.lr.get(resolution, 0.0001))
            adjust_lr(d_optimizer, args.lr.get(resolution, 0.0004))
        # Pull the next real batch; restart the loader when it is exhausted.
        try:
            real_image, caption = next(data_loader)
        except (OSError, StopIteration):
            data_loader = iter(loader)
            real_image, caption = next(data_loader)
        used_sample += real_image.shape[0]
        b_size = real_image.size(0)
        real_image = real_image.cuda()
        caption = caption.cuda()
        # c_code, sent_emb, words_embs, mu, log_var = text_process(caption)
        sent_emb, words_embs = text_process(caption)
        # ---- Discriminator: real images ----
        if args.loss == 'wgan-gp':
            real_predict = discriminator(real_image, sent_emb, step=step, alpha=alpha)
            # Small drift penalty (0.001 * E[D^2]) keeps scores bounded.
            real_predict = real_predict.mean() - 0.001 * (real_predict ** 2).mean()
            (-real_predict).backward(retain_graph=True)
            ####Todo: fit for condition gan###
        elif args.loss == 'r1':
            real_image.requires_grad = True
            real_scores = discriminator(real_image, sent_emb, step=step, alpha=alpha)
            real_predict = F.softplus(-real_scores).mean()
            real_predict.backward(retain_graph=True)
            # R1 gradient penalty on real samples.
            grad_real = grad(
                outputs=real_scores.sum(), inputs=real_image, create_graph=True
            )[0]
            grad_penalty = (
                grad_real.view(grad_real.size(0), -1).norm(2, dim=1) ** 2
            ).mean()
            grad_penalty = 10 / 2 * grad_penalty
            grad_penalty.backward()
            if (i+1) % 10 == 0:
                grad_loss_val = grad_penalty.item()
        ####Todo: fit for condition gan###
        # ---- Latent sampling (optionally with style mixing) ----
        if args.mixing and random.random() < 0.9:
            gen_in11, gen_in12, gen_in21, gen_in22 = torch.randn(
                4, b_size, code_size, device='cuda'
            ).chunk(4, 0)
            gen_in1 = [gen_in11.squeeze(0), gen_in12.squeeze(0)]
            gen_in2 = [gen_in21.squeeze(0), gen_in22.squeeze(0)]
        else:
            gen_in1, gen_in2 = torch.randn(2, b_size, 384, device='cuda').chunk(
                2, 0
            )
            gen_in1, gen_in2 = gen_in1.squeeze(0), gen_in2.squeeze(0)
        # Condition the latents on the sentence embedding.
        gen_in1 = torch.cat([gen_in1, sent_emb], dim=1)
        gen_in2 = torch.cat([gen_in2, sent_emb], dim=1)
        # ---- Discriminator: fake images ----
        fake_image = generator(gen_in1, step=step, alpha=alpha)
        fake_predict = discriminator(fake_image, sent_emb, step=step, alpha=alpha)
        #mismatch_predict = discriminator(real_image[:(b_size-1)], sent_emb[1:b_size], step=step, alpha=alpha)
        if args.loss == 'wgan-gp':
            fake_predict = fake_predict.mean()
            # mismatch_predict = mismatch_predict.mean() / 2.
            # (fake_predict + mismatch_predict).backward(retain_graph=True)
            (fake_predict).backward(retain_graph=True)
            # WGAN-GP gradient penalty on interpolates between real and fake.
            eps = torch.rand(b_size, 1, 1, 1).cuda()
            x_hat = eps * real_image.data + (1 - eps) * fake_image.data
            x_hat.requires_grad = True
            hat_predict = discriminator(x_hat, sent_emb, step=step, alpha=alpha)
            grad_x_hat = grad(
                outputs=hat_predict.sum(), inputs=x_hat, create_graph=True
            )[0]
            grad_penalty = (
                (grad_x_hat.view(grad_x_hat.size(0), -1).norm(2, dim=1) - 1) ** 2
            ).mean()
            grad_penalty = 10 * grad_penalty
            grad_penalty.backward()
            if (i+1) % 10 == 0:
                grad_loss_val = grad_penalty.item()
                disc_loss_val = (-real_predict + fake_predict).item()
            ####Todo: fit for condition gan###
        elif args.loss == 'r1':
            fake_predict = F.softplus(fake_predict).mean()
            fake_predict.backward()
            if i % 10 == 0:
                disc_loss_val = (real_predict + fake_predict).item()
        d_optimizer.step()
        # ---- Generator / text-encoder update every n_critic steps ----
        if (i + 1) % n_critic == 0:
            text_process.zero_grad()
            generator.zero_grad()
            # Only the text encoder's projection head is trained.
            requires_grad(text_process.module.bert_embedding.fc, True)
            requires_grad(generator, True)
            requires_grad(discriminator, False)
            fake_image = generator(gen_in2, step=step, alpha=alpha)
            predict = discriminator(fake_image, sent_emb, step=step, alpha=alpha)
            if args.loss == 'wgan-gp':
                loss = (-predict).mean()
            elif args.loss == 'r1':
                loss = F.softplus(-predict).mean()
            # kl_loss = KL_loss(mu, log_var)
            (loss).backward()
            if (i+1) % 10 == 0:
                gen_loss_val = loss.item()
                # kl_loss_val = kl_loss.item()
            t_optimizer.step()
            g_optimizer.step()
            # EMA copies used for sampling/checkpoints.
            accumulate(t_running, text_process.module)
            accumulate(g_running, generator.module)
            requires_grad(text_process, False)
            requires_grad(generator, False)
            requires_grad(discriminator, True)
        # ---- Periodic sample grid + inception score ----
        if (i + 1) % 1000 == 0:
            images = []
            with torch.no_grad():
                # fixed_c_code, _, _, _, _ = t_running(fixed_caption)
                sent_emb, _ = t_running(fixed_caption)
                for _ in range(fixed_gen_i):
                    images.append(
                        g_running(
                            torch.cat([torch.randn(fixed_gen_j, 384).cuda(),
                                       sent_emb[:fixed_gen_j]], dim=1),
                            step=step, alpha=alpha
                        ).data.cpu()
                    )
            images = torch.cat(images, 0)
            score = inception_score.cal(images)
            utils.save_image(
                images,
                os.path.join(args.out, f'{str(i+1).zfill(6)}-{4 * 2 ** step}x{4 * 2 ** step}.png'),
                nrow=fixed_gen_i,
                normalize=True,
                range=(-1, 1),
            )
        # ---- Periodic EMA checkpoints ----
        if (i + 1) % 10000 == 0:
            torch.save(
                t_running.state_dict(),
                os.path.join(args.out, f'checkpoint/t_{str(i + 1).zfill(6)}.model')
            )
            torch.save(
                g_running.state_dict(),
                os.path.join(args.out, f'checkpoint/g_{str(i + 1).zfill(6)}.model')
            )
        state_msg = (
            f'Size: {4 * 2 ** step}; G: {gen_loss_val:.4f}; D: {disc_loss_val:.4f}; '
            f'KL: {kl_loss_val:.4f}; Grad: {grad_loss_val:.4f}; '
            f'IS: {score: .4f}; '
            f'Alpha: {alpha:.4f};'
        )
        pbar.set_description(state_msg)
'O': O, 'C': C, 'P': P }, ignore_index=True) cur_idx += 1 step += 1 return result if __name__ == '__main__': EP = 100 SAVING_DIR = '../models/' tokenizer = BertTokenizer.from_pretrained( '/home/zydq/.torch/models/bert/chinese_wwm_ext_pytorch', do_lower_case=True) test_dataset = ReviewDataset('../data/TEST/Test_reviews.csv', None, tokenizer, type='laptop') test_loader = DataLoader(test_dataset, 12, collate_fn=test_dataset.batchify, shuffle=False, num_workers=5) model = OpinioNet.from_pretrained( '/home/zydq/.torch/models/bert/chinese_wwm_ext_pytorch') model.load_state_dict(torch.load('../models/saved_best_model_wwm_ext')) model.cuda()
def train_and_eval_model(
        args,
        saved_pickle_path=parent_folder_path +
        "/data_generated/squad_retrieval_data_seed_0_dev_2000.pickle"):
    """Train a BERT retriever on SQuAD or OpenBook and evaluate per epoch.

    Builds train/dev/test/fact dataloaders for the dataset selected by
    ``args.dataset`` ("squad" or "openbook"), trains for ``args.n_epoch``
    epochs, tracks dev/test MRR, and checkpoints the model plus result
    dictionaries whenever the dev MRR improves.  Returns 0 in all cases
    (including the unsupported-dataset early exit).

    NOTE(review): ``saved_pickle_path`` is never read inside this function —
    confirm whether it is dead or consumed by a callee via ``args``.
    """
    # TODO: later think about a way to pass this folder directory in a clever way.
    N_EPOCH = args.n_epoch
    BATCH_SIZE_TRAIN = args.batch_size_train
    BATCH_SIZE_EVAL = args.batch_size_eval
    NUM_WORKERS = args.n_worker
    N_NEG_FACT = args.n_neg_sample
    # Fall back to CPU when CUDA is unavailable regardless of args.device.
    DEVICE = torch.device(
        args.device) if torch.cuda.is_available() else torch.device("cpu")

    # Instantiate BERT retriever, optimizer and tokenizer.
    bert_retriever = BertSQuADRetriever(N_NEG_FACT, DEVICE, BATCH_SIZE_TRAIN,
                                        BATCH_SIZE_EVAL)
    bert_retriever.to(DEVICE)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    optimizer = optim.Adam(bert_retriever.parameters(), lr=0.00001)

    # Timestamp (YYYY-MM-DD_HHMM) used to name the output folder.
    now = datetime.datetime.now()
    date_time = str(now)[:10] + '_' + str(now)[11:13] + str(now)[14:16]

    if args.dataset == "squad":
        # Load SQuAD dataset and dataloader.
        squad_retrieval_data = squad_retrieval.convert_squad_to_retrieval(
            tokenizer, random_seed=args.seed, num_dev=args.num_dev)
        squad_retrieval_train_dataset = squad_retrieval.SQuADRetrievalDatasetTrain(
            instance_list=squad_retrieval_data["train_list"],
            sent_list=squad_retrieval_data["sent_list"],
            doc_list=squad_retrieval_data["doc_list"],
            resp_list=squad_retrieval_data["resp_list"],
            tokenizer=tokenizer,
            random_seed=args.seed,
            n_neg_sample=N_NEG_FACT)
        retrieval_train_dataloader = DataLoader(
            squad_retrieval_train_dataset,
            batch_size=BATCH_SIZE_TRAIN,
            shuffle=True,
            num_workers=NUM_WORKERS,
            collate_fn=squad_retrieval.PadCollateSQuADTrain())

        # Dev queries (no shuffling so results stay aligned with instances).
        squad_retrieval_dev_dataset = squad_retrieval.SQuADRetrievalDatasetEvalQuery(
            instance_list=squad_retrieval_data["dev_list"],
            sent_list=squad_retrieval_data["sent_list"],
            doc_list=squad_retrieval_data["doc_list"],
            resp_list=squad_retrieval_data["resp_list"],
            tokenizer=tokenizer)
        retrieval_dev_dataloader = DataLoader(
            squad_retrieval_dev_dataset,
            batch_size=BATCH_SIZE_EVAL,
            shuffle=False,
            num_workers=NUM_WORKERS,
            collate_fn=squad_retrieval.PadCollateSQuADEvalQuery())

        # Test queries.
        squad_retrieval_test_dataset = squad_retrieval.SQuADRetrievalDatasetEvalQuery(
            instance_list=squad_retrieval_data["test_list"],
            sent_list=squad_retrieval_data["sent_list"],
            doc_list=squad_retrieval_data["doc_list"],
            resp_list=squad_retrieval_data["resp_list"],
            tokenizer=tokenizer)
        retrieval_test_dataloader = DataLoader(
            squad_retrieval_test_dataset,
            batch_size=BATCH_SIZE_EVAL,
            shuffle=False,
            num_workers=NUM_WORKERS,
            collate_fn=squad_retrieval.PadCollateSQuADEvalQuery())

        # Facts (the retrieval candidates) are encoded separately.
        squad_retrieval_eval_fact_dataset = squad_retrieval.SQuADRetrievalDatasetEvalFact(
            instance_list=squad_retrieval_data["resp_list"],
            sent_list=squad_retrieval_data["sent_list"],
            doc_list=squad_retrieval_data["doc_list"],
            resp_list=squad_retrieval_data["resp_list"],
            tokenizer=tokenizer)
        retrieval_eval_fact_dataloader = DataLoader(
            squad_retrieval_eval_fact_dataset,
            batch_size=BATCH_SIZE_EVAL,
            shuffle=False,
            num_workers=NUM_WORKERS,
            collate_fn=squad_retrieval.PadCollateSQuADEvalFact())

        save_folder_path = parent_folder_path + '/data_generated/squad_retrieval_seed_' + str(
            args.seed) + "_" + date_time + "/"

    elif args.dataset == "openbook":
        # Same four dataloaders, built from the OpenBook corpus instead.
        train_list, dev_list, test_list, kb = openbook_retrieval.construct_retrieval_dataset_openbook(
            num_neg_sample=N_NEG_FACT, random_seed=args.seed)
        openbook_retrieval_train_dataset = openbook_retrieval.OpenbookRetrievalDatasetTrain(
            instance_list=train_list,
            kb=kb,
            tokenizer=tokenizer,
            num_neg_sample=N_NEG_FACT)
        retrieval_train_dataloader = DataLoader(
            openbook_retrieval_train_dataset,
            batch_size=BATCH_SIZE_TRAIN,
            shuffle=True,
            num_workers=NUM_WORKERS,
            collate_fn=openbook_retrieval.PadCollateOpenbookTrain())

        openbook_retrieval_dev_dataset = openbook_retrieval.OpenbookRetrievalDatasetEvalQuery(
            instance_list=dev_list, tokenizer=tokenizer)
        retrieval_dev_dataloader = DataLoader(
            openbook_retrieval_dev_dataset,
            batch_size=BATCH_SIZE_EVAL,
            shuffle=False,
            num_workers=NUM_WORKERS,
            collate_fn=openbook_retrieval.PadCollateOpenbookEvalQuery())

        openbook_retrieval_test_dataset = openbook_retrieval.OpenbookRetrievalDatasetEvalQuery(
            instance_list=test_list, tokenizer=tokenizer)
        retrieval_test_dataloader = DataLoader(
            openbook_retrieval_test_dataset,
            batch_size=BATCH_SIZE_EVAL,
            shuffle=False,
            num_workers=NUM_WORKERS,
            collate_fn=openbook_retrieval.PadCollateOpenbookEvalQuery())

        openbook_retrieval_eval_fact_dataset = openbook_retrieval.OpenbookRetrievalDatasetEvalFact(
            kb=kb, tokenizer=tokenizer)
        retrieval_eval_fact_dataloader = DataLoader(
            openbook_retrieval_eval_fact_dataset,
            batch_size=BATCH_SIZE_EVAL,
            shuffle=False,
            num_workers=NUM_WORKERS,
            collate_fn=openbook_retrieval.PadCollateOpenbookEvalFact())

        save_folder_path = parent_folder_path + '/data_generated/openbook_retrieval_seed_' + str(
            args.seed) + "_" + date_time + "/"
    else:
        # Unsupported dataset name: bail out quietly.
        return 0

    if not os.path.exists(save_folder_path):
        os.makedirs(save_folder_path)

    # Start evaluation.
    best_mrr = 0
    # Columns: [train_loss, dev_mrr, test_mrr] per epoch.
    main_result_array = np.zeros((N_EPOCH, 3))
    for epoch in range(N_EPOCH):
        print("=" * 20)
        print("Epoch ", epoch + 1)
        train_loss = bert_retriever.train_epoch(optimizer,
                                                retrieval_train_dataloader)
        dev_result_dict, test_result_dict = bert_retriever.eval_epoch(
            retrieval_dev_dataloader, retrieval_test_dataloader,
            retrieval_eval_fact_dataloader)

        # Mean reciprocal rank over all dev/test queries.
        dev_mrr = sum(dev_result_dict["mrr"]) / len(dev_result_dict["mrr"])
        test_mrr = sum(test_result_dict["mrr"]) / len(test_result_dict["mrr"])

        print("\t\tepoch " + str(epoch + 1) + " training loss:" +
              str(train_loss) + " dev mrr:" + str(dev_mrr) + " test mrr:" +
              str(test_mrr))
        main_result_array[epoch, :] = [train_loss, dev_mrr, test_mrr]

        # Checkpoint model and raw result dicts on dev-MRR improvement.
        if dev_mrr > best_mrr:
            best_mrr = dev_mrr
            torch.save(bert_retriever,
                       save_folder_path + "saved_bert_retriever")
            with open(save_folder_path + "dev_dict.pickle", "wb") as handle:
                pickle.dump(dev_result_dict, handle)
            with open(save_folder_path + "test_dict.pickle", "wb") as handle:
                pickle.dump(test_result_dict, handle)

    # NOTE(review): placement after the loop assumed from the collapsed
    # source — confirm the per-epoch results should be saved once at the end.
    np.save(save_folder_path + "main_result.npy", main_result_array)

    return 0
def _Get_Bert_Representation(self):
    """Build a {line: 768-dim vector} map of BERT embeddings and pickle it.

    Walks every file under ./data/test; for each line it drops the trailing
    character and the last whitespace-separated field (presumably a label —
    TODO confirm), wraps the remainder in [CLS]/[SEP], and averages the
    final-layer hidden states of all inner tokens into a numpy vector.
    The tokenizer and model are lazily created and cached on ``self``.
    The resulting map is pickled to ./bert_map.
    """
    count = 0
    bert_map = {}
    for root, dirs, files in os.walk("./data/test"):
        for fname in files:
            file_path = os.path.join(root, fname)
            print(file_path)
            # BUGFIX: the handle used to shadow the loop variable `file`
            # and was never closed; use a context manager instead.
            with open(file_path, "r") as fh:
                for raw_line in fh:
                    # Drop the trailing newline character, then drop the
                    # last space-separated field.
                    line = raw_line[:len(raw_line) - 1]
                    parts = line.split(" ")
                    line = " ".join(parts[:len(parts) - 1])
                    if line in bert_map:
                        continue
                    # Lazily instantiate the cached tokenizer/model.
                    if self.bert_tokenizer is None:
                        self.bert_tokenizer = BertTokenizer.from_pretrained(
                            'bert-base-uncased')
                    text = "[CLS] " + line + " [SEP]"
                    # Collapse the double space left when the stripped line
                    # was empty or had repeated separators.
                    text = text.replace("  ", " ")
                    tokenized_text = self.bert_tokenizer.tokenize(text)
                    indexed_tokens = self.bert_tokenizer.convert_tokens_to_ids(
                        tokenized_text)
                    # Single-sentence input: all segment ids are 0.
                    segments_ids = [0 for _ in tokenized_text]
                    tokens_tensor = torch.tensor([indexed_tokens])
                    segments_tensors = torch.tensor([segments_ids])
                    if self.bert is None:
                        self.bert = BertModel.from_pretrained(
                            'bert-base-uncased')
                        self.bert.eval()
                    with torch.no_grad():
                        encoded_layers, _ = self.bert(tokens_tensor,
                                                      segments_tensors)
                        # Final layer, first (only) batch element.
                        last_layer = encoded_layers[-1].numpy()[0]
                        n_tokens = len(last_layer)
                        # Mean-pool every token except [CLS]/[SEP].
                        # NOTE(review): divides by zero if the line tokenizes
                        # to nothing but [CLS][SEP] — original had the same
                        # behavior.
                        representation = np.zeros(768)
                        pooled = 0
                        for i in range(1, n_tokens - 1):
                            representation += last_layer[i]
                            pooled += 1
                        representation = representation * 1.0 / pooled
                    bert_map[line] = representation
                    count += 1
                    # Progress heartbeat.
                    if count % 100 == 0:
                        print(count)
    with open("./bert_map", 'wb') as out:
        pickle.dump(bert_map, out)
def load_data(data_folder, pretrained_model):
    """Load the pickled tokenized documents and a matching BERT tokenizer.

    data_folder: a path-like object (supports the ``/`` operator).
    pretrained_model: model name or path passed to ``BertTokenizer``.
    Returns (docs, tokenizer).
    """
    pickle_path = data_folder / 'tokenized_docs_bert.pkl'
    with open(pickle_path, 'rb') as handle:
        documents = pickle.load(handle)
    bert_tokenizer = BertTokenizer.from_pretrained(pretrained_model)
    return documents, bert_tokenizer
###################################################### # Loading Data ###################################################### ddt = DDT() conllu_format = ddt.load_as_conllu() L = [(i, token.form, token.misc.get("name").pop()) for i, sent in enumerate(conllu_format) for token in sent] df = pd.DataFrame(L, columns=['sentence_id', 'words', 'labels']) ###################################################### # to bert tokens ###################################################### sent_str = [sent.text for sent in conllu_format] tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) tokenized_texts = [tokenizer.tokenize(sent) for sent in sent_str] # Convert tokens to indexes with open("/home/au554730/Desktop/BERT_test/danish_bert_uncased/vocab.txt") as f: vocab = f.read() vocab = vocab.split("\n") vocab_d = {e: i for i, e in enumerate(vocab)} def sentence_to_idx(sent): return [vocab_d.get(token, vocab_d["[UNK]"]) for token in sent] input_ids = [sentence_to_idx(t) for t in tokenized_texts] max_len = 128
def __init__(self, csv_paths, tokenizer):
    """Remember the CSV paths and tokenizers this dataset will use.

    csv_paths: locations of the input CSV files.
    tokenizer: caller-supplied tokenizer, kept alongside a dedicated
        'bert-base-uncased' wordpiece tokenizer created here.
    """
    from pytorch_pretrained_bert import BertTokenizer

    self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.csv_paths = csv_paths
    self.tokenizer = tokenizer
# Resolve all dataset paths relative to the configured data root.
dataroot = opt.dataroot
tag_vocab_dir = dataroot + '/vocab.slot'
class_vocab_dir = dataroot + '/vocab.intent'
train_data_dir = dataroot + '/train'
valid_data_dir = dataroot + '/valid'
test_data_dir = dataroot + '/test'

# Training mode reads the fresh slot/intent vocabularies from the dataset;
# testing mode reloads the vocabularies saved by a previous training run
# (without PAD/UNK entries).
if not opt.testing:
    tag_to_idx, idx_to_tag = vocab_reader.read_vocab_file(tag_vocab_dir,
                                                          bos_eos=False)
    class_to_idx, idx_to_class = vocab_reader.read_vocab_file(class_vocab_dir,
                                                              bos_eos=False)
else:
    tag_to_idx, idx_to_tag = vocab_reader.read_vocab_file(
        opt.read_vocab + '.tag', bos_eos=False, no_pad=True, no_unk=True)
    class_to_idx, idx_to_class = vocab_reader.read_vocab_file(
        opt.read_vocab + '.class', bos_eos=False, no_pad=True, no_unk=True)

tokenizer = BertTokenizer.from_pretrained(opt.bert_model_name)
logger.info("Vocab size: %s %s" % (len(tag_to_idx), len(class_to_idx)))

# Persist the vocabularies so a later test run can reload them above.
if not opt.testing:
    vocab_reader.save_vocab(idx_to_tag,
                            os.path.join(exp_path, opt.save_vocab + '.tag'))
    vocab_reader.save_vocab(idx_to_class,
                            os.path.join(exp_path, opt.save_vocab + '.class'))

# Lowercasing is disabled unconditionally (the BERT tokenizer handles case).
opt.word_lowercase = False

# Training needs all three splits; testing only valid + test (with original
# ordering preserved via keep_order).
if not opt.testing:
    train_feats, train_tags, train_class = data_reader.read_seqtag_data_with_class(
        train_data_dir, tag_to_idx, class_to_idx, multiClass=opt.multiClass,
        lowercase=opt.word_lowercase)
    valid_feats, valid_tags, valid_class = data_reader.read_seqtag_data_with_class(
        valid_data_dir, tag_to_idx, class_to_idx, multiClass=opt.multiClass,
        keep_order=opt.testing, lowercase=opt.word_lowercase)
    test_feats, test_tags, test_class = data_reader.read_seqtag_data_with_class(
        test_data_dir, tag_to_idx, class_to_idx, multiClass=opt.multiClass,
        keep_order=opt.testing, lowercase=opt.word_lowercase)
else:
    valid_feats, valid_tags, valid_class = data_reader.read_seqtag_data_with_class(
        valid_data_dir, tag_to_idx, class_to_idx, multiClass=opt.multiClass,
        keep_order=opt.testing, lowercase=opt.word_lowercase)
    test_feats, test_tags, test_class = data_reader.read_seqtag_data_with_class(
        test_data_dir, tag_to_idx, class_to_idx, multiClass=opt.multiClass,
        keep_order=opt.testing, lowercase=opt.word_lowercase)
# Minimal demo: wordpiece-tokenize a sentence with a pretrained BERT
# tokenizer and show the resulting token ids.
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-large-cased')

sent = "hi mary"
# "mary" is split into wordpieces because it is lowercase in a cased vocab.
tokens = tokenizer.tokenize(sent)
tokens_ids = tokenizer.convert_tokens_to_ids(tokens)

print(tokens)  #['hi', 'ma', '##ry']
print(tokens_ids)  #[20844, 12477, 1616]
def _embed_and_emit(bert_embedding, sent_tokens, f, check_alignment):
    """Embed one pre-tokenized sentence with flair and emit per-token vectors.

    Writes "token <floats>\n" lines to `f` when it is a file object,
    otherwise prints them.  When `check_alignment` is true, raises
    ValueError if flair's token text diverges from our tokens.
    """
    # Using our own tokens (our own tokenization) tokens
    tokens: List[Token] = [Token(token) for token in sent_tokens]
    # create an empty sentence and add tokens from our own tokenization
    sentence = Sentence()
    sentence.tokens = tokens
    bert_embedding.embed(sentence)
    for token, st in zip(sentence, sent_tokens):
        if check_alignment and token.text != st:
            raise ValueError("Invalid token text")
        line = token.text + " " + " ".join(
            [str(num) for num in token.embedding.tolist()]) + '\n'
        if f:
            f.write(line)
        else:
            print(line)


def bert_embeddings(sentences, tokenized_contents, output_file=None):
    """Emit BERT ('bert-large-uncased') embeddings for pre-tokenized sentences.

    sentences: iterable whose items expose the raw sentence string at
        index 0 (used only for the wordpiece-length check).
    tokenized_contents: token lists aligned with `sentences`.
    output_file: optional path; when given, embeddings are written there,
        otherwise printed to stdout.

    Sentences whose wordpiece length reaches 510 are split in half: the
    second half is embedded first, then the first half (original behavior
    preserved — TODO confirm the emission order is intended).
    """
    # Using bert_tokenizer for checking for sequence wordpiece tokens length > 512
    bert_tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

    # BUGFIX: `f` used to be created only when output_file was set but was
    # written to unconditionally after long sentences, raising NameError
    # when output_file was None.  It was also never closed.
    f = open(output_file, 'w') if output_file else None
    try:
        # init multilingual BERT embedding
        bert_embedding = TransformerWordEmbeddings('bert-large-uncased')
        long_sent = False
        for i, (sent, sent_tokens) in enumerate(
                zip(sentences, tokenized_contents)):
            print("Encoding the {}th input sentence for BERT embedding!".format(i))
            # getting the length of bert tokenized sentence after wordpiece
            # tokenization; split overly long sentences in half.
            if len(bert_tokenizer.tokenize(sent[0])) >= 510:
                long_sent = True
                truncated_tokens = sent_tokens[:len(sent_tokens) // 2]
                sent_tokens = sent_tokens[len(sent_tokens) // 2:]

            _embed_and_emit(bert_embedding, sent_tokens, f,
                            check_alignment=True)

            if long_sent:
                # Embed the remaining (first) half; the original code did not
                # re-check token alignment here.
                _embed_and_emit(bert_embedding, truncated_tokens, f,
                                check_alignment=False)
                long_sent = False
                # Blank-line separator (now guarded — see BUGFIX above).
                if f:
                    f.write('\n')
    finally:
        if f:
            f.close()
def main():
    """
    To run this code with default settings and example data, do

    $ python experiment.py data/example.csv
    """
    # Ensure the default output directories exist.
    if not os.path.exists('output'):
        os.mkdir('output')
    if not os.path.exists('output/auxiliary'):
        os.mkdir('output/auxiliary')

    ## Argument parsing
    args = parser.parse_args()
    if args.factors is not None:
        args.factors = args.factors.split(",")
        if len(args.factors) > 2:
            print(
                "WARNING: Cannot plot more than 2 factors at a time. Trimming to",
                args.factors[:2])
            args.factors = args.factors[:2]
    if args.out is not None:
        if os.path.exists(args.out):
            # Existing output dir is silently reused.
            pass
            # if args.no_overwrite:
            #     quit()
            # if input('Output directory {} already exists. Risk overwriting files? N/y'.format(args.out)) != 'y':
            #     quit()
        else:
            os.mkdir(args.out)

    # Derive default cache-file names that encode the relevant settings.
    if args.raw_out is None:
        args.raw_out = 'output/auxiliary/{}_{}{}{}{}.pkl'.format(
            os.path.basename(args.data)[:-4], args.method,
            "_chain" if args.combine == "chain" else "",  # cumsum can use same as no
            '_norm' if args.method == 'attention' and args.normalize_heads else '',
            ('_' + str(args.n_items)) if args.n_items is not None else '')
    if args.trees_out is None:
        args.trees_out = 'output/auxiliary/{}_SPANNING_{}{}{}{}{}{}.pkl'.format(
            os.path.basename(args.data)[:-4],
            args.method,
            '_' + args.combine if args.combine != "no" else "",
            '_norm' if args.method == 'attention' and args.normalize_heads else '',
            ('_' + str(args.n_items)) if args.n_items is not None else '',
            '_' + args.group_merger,
            '_' + 'transpose' if args.transpose else '',
        )
    if args.pearson_out is None:
        args.pearson_out = 'output/auxiliary/{}_PEARSON_{}{}{}{}{}{}.pkl'.format(
            os.path.basename(args.data)[:-4],
            args.method,
            '_' + args.combine if args.combine != "no" else "",
            '_norm' if args.method == 'attention' and args.normalize_heads else '',
            ('_' + str(args.n_items)) if args.n_items is not None else '',
            '_' + args.group_merger,
            '_' + 'transpose' if args.transpose else '',
        )

    ## Do we need to apply BERT (anew)?
    apply_BERT = True
    if os.path.exists(args.raw_out):
        if args.no_overwrite:
            apply_BERT = False
        elif input('Raw output file exists. Overwrite? (N/y)') != "y":
            apply_BERT = False

    ## Do we need to compute spanning trees (anew)?
    compute_trees = True
    if os.path.exists(args.trees_out):
        if args.no_overwrite:
            compute_trees = False
        elif input('Trees output file exists. Overwrite? (N/y)') != "y":
            compute_trees = False

    ## Do we need to compute pearson coefficients (anew)?
    compute_pearson = True
    if os.path.exists(args.pearson_out):
        if args.no_overwrite:
            compute_pearson = False
        elif input('Pearson output file exists. Overwrite? (N/y)') != "y":
            compute_pearson = False

    ## Set up tokenizer, data
    tokenizer = BertTokenizer.from_pretrained(
        args.bert, do_lower_case=("uncased" in args.bert))
    items, dependency_trees = data_utils.parse_data(args.data,
                                                    tokenizer,
                                                    max_items=args.n_items,
                                                    words_as_groups=True,
                                                    dependencies=True)

    ## Store for convenience
    args.factors = args.factors or items.factors[:2]  # by default use the first two factors from the data

    ## Now that args.factors is known, finally choose output directory
    if args.out is None:
        # Find a fresh temp<N> prefix under output/.
        dirname = 'temp'
        out_idx = 0
        if not os.path.exists("output"):
            os.mkdir('output')
        if not os.path.exists("data/auxiliary/"):
            os.mkdir('data/auxiliary')
        while any(x.startswith(dirname) for x in os.listdir('output')):
            out_idx += 1
            dirname = 'temp{}'.format(out_idx)
        dirname += "_{}{}{}{}{}".format(
            args.method,
            "-" + args.combine if args.combine != "no" else "",
            "_normalized" if
            (args.method == "attention" and args.normalize_heads) else "",
            '_' + '-x-'.join(args.factors) if len(args.factors) > 0 else '',
            "_transposed" if args.transpose else "",
        )
        args.out = os.path.join("output", dirname)
        os.mkdir(args.out)

    ## Apply BERT or, if available, load results saved from previous run
    if apply_BERT:
        data_for_all_items = interface_BERT.apply_bert(items, tokenizer, args)
        with open(args.raw_out, 'wb') as file:
            pickle.dump(data_for_all_items, file)
        print('BERTs raw outputs saved as', args.raw_out)
    else:
        with open(args.raw_out, 'rb') as file:
            print('BERTs raw outputs loaded from', args.raw_out)
            data_for_all_items = pickle.load(file)
    n_layers = data_for_all_items[0].shape[0]  # for convenience

    # The list data_for_all_items now contains, for each item, weights (n_layers, n_tokens, n_tokens)

    ## If some computation needs to be done, we need to process the BERT outputs a bit
    if compute_trees or compute_pearson:

        print("Processing the data from BERT...")

        ## Take cumsum if needed (placed outside the foregoing, to avoid having to save/load separate file for this
        if args.combine == "cumsum":
            for i in range(len(data_for_all_items)):
                data_for_all_items[i] = np.cumsum(data_for_all_items[i],
                                                  axis=0)

        ## Take averages over groups of tokens
        if not args.ignore_groups and not len(items.groups) == 0:
            data_for_all_items = data_utils.merge_grouped_tokens(
                items, data_for_all_items, method=args.group_merger)

        ## Compute balances (though whether they will be plotted depends on args.balance)
        # (Re)compute balance: how much token influences minus how much is influenced
        balance_for_all_items = []
        for data_for_item in data_for_all_items:
            balance_for_item = []
            for data_for_layer in data_for_item:
                balance = np.nansum(data_for_layer -
                                    data_for_layer.transpose(),
                                    axis=1)
                balance_for_item.append(balance)
            balance_for_all_items.append(np.stack(balance_for_item))

        # At this point we have two lists of numpy arrays: for each item, the weights & balance across layers.

        ## Store the weights in dataframe together with original data
        # TODO All of this feels terribly hacky...
        # First flatten the numpy array per item
        data_for_all_items = [
            data.reshape(-1).tolist() for data in data_for_all_items
        ]
        balance_for_all_items = [
            data.reshape(-1).tolist() for data in balance_for_all_items
        ]
        # And then concatenate them (still per item per layer)
        data_and_balance_for_all_items = [
            array1 + array2 for array1, array2 in zip(data_for_all_items,
                                                      balance_for_all_items)
        ]
        # Concatenate onto original data rows (with each row repeated n_layers times)
        # original_items_times_nlayers = [a for l in [[i.to_list()] * n_layers for (_, i) in items.iterrows()] for a in l]
        data_for_dataframe = [
            a + b for a, b in zip([i.to_list() for (
                _, i) in items.iterrows()], data_and_balance_for_all_items)
        ]
        # Multi-column to represent the (flattened) numpy arrays in a structured way
        multi_columns = pd.MultiIndex.from_tuples(
            [(c, '', '', '') for c in items.columns] +
            [('weights', l, g1, g2) for l in range(n_layers)
             for g1 in items.groups for g2 in items.groups] +
            [('balance', l, g, '') for l in range(n_layers)
             for g in items.groups],
            names=['', 'layer', 'in', 'out'])
        df = pd.DataFrame(data_for_dataframe,
                          index=items.index,
                          columns=multi_columns)

        # Dataframe with three sets of columns: columns from original dataframe, weights (as extracted from BERT & grouped), and the balance computed from them

    ## Apply BERT or, if available, load results saved from previous run
    if compute_trees:
        trees_df = analyze_by_spanning_trees(df, items, dependency_trees,
                                             n_layers, args)
        with open(args.trees_out, 'wb') as file:
            pickle.dump(trees_df, file)
        print('Trees and scores saved as', args.trees_out)
    else:
        with open(args.trees_out, 'rb') as file:
            print('Trees and scores loaded from', args.trees_out)
            trees_df = pickle.load(file)

    plot_tree_scores(trees_df, args)

    if compute_pearson:
        pearson_df = analyze_by_pearson_correlation(df, items,
                                                    dependency_trees,
                                                    n_layers, args)
        with open(args.pearson_out, 'wb') as file:
            pickle.dump(pearson_df, file)
        print('Pearson coefficients and p-values saved as', args.pearson_out)
    else:
        with open(args.pearson_out, 'rb') as file:
            print('Pearson coefficients and p-values loaded from',
                  args.pearson_out)
            pearson_df = pickle.load(file)

    plot_pearson_scores(pearson_df, args)
def _create_examples(self, sents, mode, prefix, embedding_method,
                     pretrained=None):
    """Build the example dicts for one data split.

    sents: list of strings
    mode: (string) train, dev or test
    prefix: cache-file name prefix
    embedding_method: "bert" or "roberta" (consulted when pretrained)
    pretrained: None -> vectorize with self.lang; True -> use a pretrained
        tokenizer chosen by embedding_method
    return:
        examples: a list containing dicts representing each example
    """
    allowed_modes = ['train', 'dev', 'test']
    if mode not in allowed_modes:
        raise ValueError(f'Mode not recognized, try one of {allowed_modes}')

    # Cache path for the id-converted sentences.
    id_sents_pickle_path = os.path.join(
        config.CACHE_PATH,
        prefix + '_' + mode + '.pkl',
    )
    print("this is examples train_sents", sents[0])

    # Load (or create and cache) the id sentences.  Without a pretrained
    # model, conversion is done by self.lang; otherwise it is delegated to
    # the pretrained tokenizer.
    if pretrained is None:
        id_sents = load_or_create(id_sents_pickle_path,
                                  self.lang.sents2ids,
                                  sents,
                                  force_reload=self.force_reload)
    elif pretrained == True and embedding_method == "bert":
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        id_sents = load_or_create(
            id_sents_pickle_path,
            lambda x: [tokenizer.convert_tokens_to_ids(i) for i in x],
            sents,
            force_reload=self.force_reload)
    # NOTE(review): when pretrained is truthy and embedding_method is
    # neither "bert" nor "roberta", id_sents is never assigned and the
    # print below raises UnboundLocalError — confirm call patterns.
    if embedding_method == "roberta":
        roberta = torch.hub.load("pytorch/fairseq", "roberta.base")
        tokenizer = roberta.encode
        id_sents = load_or_create(
            id_sents_pickle_path,
            lambda x: [tokenizer(i) for i in x],
            sents,
            force_reload=self.force_reload,
        )
    print("id sents here00", id_sents[0])

    chars_pickle_path = os.path.join(
        config.CACHE_PATH,
        prefix + '_' + mode + '_chars.pkl',
    )
    # Character-level ids are currently stubbed out with [0] per sentence.
    char_id_sents = [[0] for i in range(len(id_sents))]
    # load_or_create(chars_pickle_path,
    #                self.lang.sents2char_ids,
    #                sents,
    #                force_reload=self.force_reload)

    # FIXME: Assuming all 3 modes will have labels. This might not be the
    # case for test data <2018-06-29 10:49:29, Jorge Balazs>
    # Also convert the textual labels into integer ids.
    # BUGFIX: the label file handle was never closed; use a context manager.
    with open(config.label_dict[self.corpus_name][mode],
              encoding="utf-8") as label_file:
        labels = label_file.readlines()
    labels = [l.rstrip() for l in labels]
    id_labels = [self.label2id[label] for label in labels]

    ids = range(len(id_sents))

    # Bundle index, raw sentence, id sentence, char ids and label per example.
    examples = zip(
        ids,
        sents,
        id_sents,
        char_id_sents,
        id_labels,
    )
    examples = [{
        'id': ex[0],
        'raw_sequence': ex[1],
        'sequence': ex[2],
        'char_sequence': ex[3],
        'label': ex[4],
    } for ex in examples]
    return examples
def create_dataloader(self):
    """Build train/valid DataLoaders from self.train_df.

    Assembles the main toxicity target, per-type auxiliary labels,
    identity labels (raw, binarized, summed) and four target-x-identity
    combination labels, tokenizes the comments with BERT, splits at
    self.train_len, and wraps everything (plus sample weights) into
    TensorDatasets/DataLoaders.  Returns (train_loader, valid_loader).
    """
    # Read model inputs and targets.
    train_comments = self.train_df["comment_text"].astype(str)
    train_label = self.train_df["target"].values
    train_type_labels = self.train_df[self.toxicity_type_list].values

    # Four combination labels: (toxic?, any-identity?) in the order
    # pp, np, pn, nn — plus the same four per individual identity column.
    train_np_labels = np.zeros((len(self.train_df), 4))
    train_np_identity_labels = np.zeros(
        (len(self.train_df), len(self.identity_list) * 4))
    # NOTE(review): column assignment on this slice may trigger pandas'
    # SettingWithCopy behavior — confirm it updates as intended.
    train_df_copy = self.train_df[self.identity_list + ["target"]]
    for column in self.identity_list + ["target"]:
        train_df_copy[column] = np.where(train_df_copy[column] > 0.5, True,
                                         False)
    pp_label_bool = train_df_copy["target"] & np.where(
        train_df_copy[self.identity_list].sum(axis=1) > 0, True, False)
    np_label_bool = ~train_df_copy["target"] & np.where(
        train_df_copy[self.identity_list].sum(axis=1) > 0, True, False)
    pn_label_bool = train_df_copy["target"] & np.where(
        (train_df_copy[self.identity_list]).sum(axis=1) == 0, True, False)
    nn_label_bool = ~train_df_copy["target"] & np.where(
        (train_df_copy[self.identity_list]).sum(axis=1) == 0, True, False)
    train_np_labels[:, 0] = np.where(pp_label_bool > 0, 1, 0)
    train_np_labels[:, 1] = np.where(np_label_bool > 0, 1, 0)
    train_np_labels[:, 2] = np.where(pn_label_bool > 0, 1, 0)
    train_np_labels[:, 3] = np.where(nn_label_bool > 0, 1, 0)
    for i, column in enumerate(self.identity_list):
        pp_label_bool = train_df_copy["target"] & train_df_copy[column]
        np_label_bool = ~train_df_copy["target"] & train_df_copy[column]
        pn_label_bool = train_df_copy["target"] & (~train_df_copy[column])
        nn_label_bool = ~train_df_copy["target"] & (~train_df_copy[column])
        train_np_identity_labels[:, i * 4 + 0] = np.where(
            pp_label_bool > 0, 1, 0)
        train_np_identity_labels[:, i * 4 + 1] = np.where(
            np_label_bool > 0, 1, 0)
        train_np_identity_labels[:, i * 4 + 2] = np.where(
            pn_label_bool > 0, 1, 0)
        train_np_identity_labels[:, i * 4 + 3] = np.where(
            nn_label_bool > 0, 1, 0)

    # Raw identity values (NaN treated as 0).
    train_identity_values = self.train_df[self.identity_list].fillna(
        0.).values
    # Sum of all raw identity values.
    train_identity_sum = train_identity_values.sum(axis=1)
    # Cap the identity sum at 1.
    train_identity_sum_label = np.where(train_identity_sum > 1, 1,
                                        train_identity_sum)
    # Binarized (0/1) identity values, thresholded at 0.5.
    train_identity_binary = copy.deepcopy(
        self.train_df[self.identity_list])
    for column in self.identity_list:
        train_identity_binary[column] = np.where(
            train_identity_binary[column] > 0.5, 1, 0)
    # 1 if any identity column is set.
    train_identity_binary_sum = train_identity_binary.sum(axis=1)
    train_identity_or_binary = np.where(train_identity_binary_sum >= 1, 1,
                                        0)
    # Collect all identity label variants (the two self-assignments are
    # kept from the original and are no-ops).
    train_identity_type_labels = train_identity_values
    train_identity_type_binary_lables = train_identity_binary
    train_identity_sum_label = train_identity_sum_label
    train_identity_binary_label = train_identity_or_binary

    # Tokenize the training comments with the BERT wordpiece tokenizer.
    bert_tokenizer = BertTokenizer.from_pretrained(self.bert_model_path,
                                                   cache_dir=None,
                                                   do_lower_case=True)
    train_bert_tokens = self.convert_lines(
        self.train_df["comment_text"].fillna("DUMMY_VALUE"), self.max_len,
        bert_tokenizer)

    # Split every label array into train/valid partitions at self.train_len.
    valid_tokens = train_bert_tokens[self.train_len:]
    valid_label = train_label[self.train_len:]
    valid_type_labels = train_type_labels[self.train_len:]
    train_tokens = train_bert_tokens[:self.train_len]
    train_label = train_label[:self.train_len]
    train_type_labels = train_type_labels[:self.train_len]
    valid_identity_type_labels = train_identity_type_labels[self.train_len:]
    train_identity_type_labels = train_identity_type_labels[:self.train_len]
    valid_identity_type_binary_lables = train_identity_type_binary_lables[
        self.train_len:]
    train_identity_type_binary_lables = train_identity_type_binary_lables[:self.train_len]
    valid_identity_sum_label = train_identity_sum_label[self.train_len:]
    train_identity_sum_label = train_identity_sum_label[:self.train_len]
    valid_identity_binary_label = train_identity_binary_label[self.train_len:]
    train_identity_binary_label = train_identity_binary_label[:self.train_len]
    valid_np_labels = train_np_labels[self.train_len:]
    train_np_labels = train_np_labels[:self.train_len]
    valid_np_identity_labels = train_np_identity_labels[self.train_len:]
    train_np_identity_labels = train_np_identity_labels[:self.train_len]

    # Compute per-sample loss weights.
    target_weight, aux_weight, identity_weight, np_weight, np_identity_weight = self.cal_sample_weights(
    )

    # Convert the tokenized data and labels to tensors; the y tensor stacks
    # [target, type labels, identity labels, np labels] column-wise.
    train_x_tensor = torch.tensor(train_tokens, dtype=torch.long)
    valid_x_tensor = torch.tensor(valid_tokens, dtype=torch.long)
    train_y_tensor = torch.tensor(np.hstack([
        train_label[:, np.newaxis], train_type_labels,
        train_identity_type_labels, train_np_labels
    ]),
                                  dtype=torch.float32)
    valid_y_tensor = torch.tensor(np.hstack([
        valid_label[:, np.newaxis], valid_type_labels,
        valid_identity_type_labels, valid_np_labels
    ]),
                                  dtype=torch.float32)
    target_weight_tensor = torch.tensor(target_weight, dtype=torch.float32)
    aux_weight_tensor = torch.tensor(aux_weight, dtype=torch.float32)
    identity_weight_tensor = torch.tensor(identity_weight,
                                          dtype=torch.float32)
    np_weight_tensor = torch.tensor(np_weight, dtype=torch.float32)
    np_identity_weight_tensor = torch.tensor(np_identity_weight,
                                             dtype=torch.float32)
    # Non-padding positions (token id > 0) serve as the attention mask.
    train_attention_mask_tensor = train_x_tensor > 0
    valid_attention_mask_tensor = valid_x_tensor > 0
    if torch.cuda.is_available():
        train_x_tensor = train_x_tensor.to(self.device)
        valid_x_tensor = valid_x_tensor.to(self.device)
        train_y_tensor = train_y_tensor.to(self.device)
        valid_y_tensor = valid_y_tensor.to(self.device)
        target_weight_tensor = target_weight_tensor.to(self.device)
        aux_weight_tensor = aux_weight_tensor.to(self.device)
        identity_weight_tensor = identity_weight_tensor.to(self.device)
        train_attention_mask_tensor = train_attention_mask_tensor.to(
            self.device)
        valid_attention_mask_tensor = valid_attention_mask_tensor.to(
            self.device)
        np_weight_tensor = np_weight_tensor.to(self.device)
        np_identity_weight_tensor = np_identity_weight_tensor.to(
            self.device)

    # Wrap tensors into datasets; when loading, dataset[:-1] is x and
    # dataset[-1] is y.
    train_dataset = data.TensorDataset(train_x_tensor, train_y_tensor,
                                       target_weight_tensor,
                                       aux_weight_tensor,
                                       identity_weight_tensor,
                                       train_attention_mask_tensor,
                                       np_weight_tensor)
    valid_dataset = data.TensorDataset(valid_x_tensor, valid_y_tensor,
                                       valid_attention_mask_tensor)
    # Wrap datasets into dataloaders (only training data is shuffled).
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=self.base_batch_size, shuffle=True)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=self.base_batch_size, shuffle=False)
    # Return the training and validation loaders.
    return train_loader, valid_loader
# Load pre-trained model with masked language model head bert_version = 'bert-large-uncased' model = BertForMaskedLM.from_pretrained(bert_version) # Preprocess text text = pre_text + target_text # Prevent RuntimeError if len(text)>2000: pre_text = '' for i in range(10,20): pre_text += list[i]+' ' text = pre_text + target_text print('After decreasing the sentences... ''\n') tokenizer = BertTokenizer.from_pretrained(bert_version) tokenized_text = tokenizer.tokenize(text) mask_positions = [] for i in range(len(tokenized_text)): if tokenized_text[i] == '_': tokenized_text[i] = '[MASK]' mask_positions.append(i) # Predict missing words from left to right model.eval() predicted_token = '' for mask_pos in mask_positions: # Convert tokens to vocab indices token_ids = tokenizer.convert_tokens_to_ids(tokenized_text) tokens_tensor = torch.tensor([token_ids]) # print('tokens_tensor: ''\n',tokens_tensor)
def _build_task_dataset(t, split, asc_dataset, ae_dataset, args, logger):
    """Build the combined ASC/AE/SG TensorDataset for one task and one split.

    The original code repeated this pipeline verbatim for train/valid/test;
    it is factored out here with the split's example getter as the only
    difference.

    Args:
        t: integer task id (stored as a per-example task label).
        split: 'train', 'valid' or 'test' — selects the processors'
            get_{train,dev,test}_examples loader.
        asc_dataset: dataset path for ASC (also the source for SG examples).
        ae_dataset: dataset path for AE.
        args: parsed command-line args (bert_model, max_seq_length,
            masking options, ...).
        logger: logger used for progress messages.

    Returns:
        (dataset, num_asc): `dataset` is a TensorDataset of 15 aligned
        tensors; `num_asc` is the post-padding number of ASC examples.
    """
    example_getter = {'train': 'get_train_examples',
                      'valid': 'get_dev_examples',
                      'test': 'get_test_examples'}[split]

    # ---- ASC (aspect sentiment classification) for the encoder ----
    processor = data_utils.AscProcessor()
    label_list = processor.get_labels()
    # NOTE(review): the original code used ABSATokenizer for the train split
    # but plain BertTokenizer for valid/test; that behavior is preserved —
    # confirm whether the difference is intentional.
    tokenizer_cls = ABSATokenizer if split == 'train' else BertTokenizer
    tokenizer = tokenizer_cls.from_pretrained(args.bert_model)
    examples = getattr(processor, example_getter)(asc_dataset)
    features = data_utils.convert_examples_to_features_gen(
        examples, label_list, args.max_seq_length, tokenizer, "asc")
    asc_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    asc_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    asc_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    asc_label_ids = torch.tensor([f.label_id for f in features],
                                 dtype=torch.long)
    # One task-id entry per ASC example.
    tasks = torch.tensor([t for f in features], dtype=torch.long)

    # ---- AE (aspect extraction) for the decoder ----
    processor = data_utils.AeProcessor()
    label_list = processor.get_labels()
    tokenizer = ABSATokenizer.from_pretrained(args.bert_model)
    examples = getattr(processor, example_getter)(ae_dataset)
    features = data_utils.convert_examples_to_features_gen(
        examples, label_list, args.max_seq_length, tokenizer, "ae")
    ae_input_ids = torch.tensor([f.input_ids for f in features],
                                dtype=torch.long)
    ae_segment_ids = torch.tensor([f.segment_ids for f in features],
                                  dtype=torch.long)
    ae_input_mask = torch.tensor([f.input_mask for f in features],
                                 dtype=torch.long)
    ae_label_ids = torch.tensor([f.label_id for f in features],
                                dtype=torch.long)

    # ---- SG (sentence generation, seq2seq) for the decoder ----
    processor = data_utils.SgProcessor()
    tokenizer = ABSATokenizer.from_pretrained(args.bert_model)
    examples = getattr(processor, example_getter)(asc_dataset)
    features = data_utils.convert_examples_to_features_gen(
        examples, None, args.max_seq_length * 2, tokenizer, "sg",
        mask_source_words=args.mask_source_words,
        max_pred=args.max_pred,
        mask_prob=args.mask_prob,
        skipgram_prb=args.skipgram_prb,
        skipgram_size=args.skipgram_size,
        mask_whole_word=args.mask_whole_word,
        vocab_words=list(tokenizer.vocab.keys()),
        indexer=tokenizer.convert_tokens_to_ids)  # seq2seq task
    sg_input_ids = torch.tensor([f.input_ids for f in features],
                                dtype=torch.long)
    sg_segment_ids = torch.tensor([f.segment_ids for f in features],
                                  dtype=torch.long)
    sg_input_mask = torch.tensor([f.input_mask for f in features],
                                 dtype=torch.long)
    sg_masked_lm_labels = torch.tensor(
        [f.masked_lm_labels for f in features], dtype=torch.long).squeeze(1)
    sg_masked_pos = torch.tensor(
        [f.masked_pos for f in features], dtype=torch.long).squeeze(1)
    sg_masked_weights = torch.tensor(
        [f.masked_weights for f in features], dtype=torch.long)

    # AE is smaller in size than SG because a sentence can have multiple
    # terms: pad AE up to SG size by re-sampling random AE rows.
    ae_length = ae_input_ids.size(0)
    while ae_input_ids.size(0) < sg_input_ids.size(0):
        rand_id = torch.randint(low=0, high=ae_length, size=(1, ))
        ae_input_ids = torch.cat([ae_input_ids, ae_input_ids[rand_id]], 0)
        ae_segment_ids = torch.cat(
            [ae_segment_ids, ae_segment_ids[rand_id]], 0)
        ae_input_mask = torch.cat([ae_input_mask, ae_input_mask[rand_id]], 0)
        ae_label_ids = torch.cat([ae_label_ids, ae_label_ids[rand_id]], 0)

    # Some examples have sentiment conflict, so AE can be larger than ASC:
    # pad ASC (and SG/tasks in lockstep, as in the original) up to AE size.
    asc_length = asc_input_ids.size(0)
    while asc_input_ids.size(0) < ae_input_ids.size(0):
        rand_id = torch.randint(low=0, high=asc_length, size=(1, ))
        asc_input_ids = torch.cat(
            [asc_input_ids, asc_input_ids[rand_id]], 0)
        asc_segment_ids = torch.cat(
            [asc_segment_ids, asc_segment_ids[rand_id]], 0)
        asc_input_mask = torch.cat(
            [asc_input_mask, asc_input_mask[rand_id]], 0)
        asc_label_ids = torch.cat(
            [asc_label_ids, asc_label_ids[rand_id]], 0)
        sg_input_ids = torch.cat([sg_input_ids, sg_input_ids[rand_id]], 0)
        sg_segment_ids = torch.cat(
            [sg_segment_ids, sg_segment_ids[rand_id]], 0)
        sg_input_mask = torch.cat(
            [sg_input_mask, sg_input_mask[rand_id]], 0)
        sg_masked_lm_labels = torch.cat(
            [sg_masked_lm_labels, sg_masked_lm_labels[rand_id]], 0)
        sg_masked_pos = torch.cat([sg_masked_pos, sg_masked_pos[rand_id]], 0)
        sg_masked_weights = torch.cat(
            [sg_masked_weights, sg_masked_weights[rand_id]], 0)
        tasks = torch.cat([tasks, tasks[rand_id]], 0)

    logger.info(" Num asc examples = %d", asc_input_ids.size(0))
    logger.info(" Num sg examples = %d", sg_input_ids.size(0))
    logger.info(" Num ae examples = %d", ae_input_ids.size(0))

    dataset = TensorDataset(
        asc_input_ids, asc_segment_ids, asc_input_mask,
        sg_input_ids, sg_segment_ids, sg_input_mask,
        sg_masked_lm_labels, sg_masked_pos, sg_masked_weights,
        ae_input_ids, ae_segment_ids, ae_input_mask,
        ae_label_ids, asc_label_ids, tasks)
    return dataset, asc_input_ids.size(0)


def get(logger=None, args=None):
    """Load ASC/AE/SG data for every task in this run's random task order.

    Reads the domain order for `args.idrandom` from the 'asc_random' file,
    then builds train/valid/test TensorDatasets per task.

    Returns:
        (data, taskcla): `data` maps task id -> dict with keys 'name',
        'ncla', 'train', 'num_train_steps', 'valid', 'test', and
        data['ncla'] holds the total class count over all tasks;
        `taskcla` is a list of (task id, ncla) pairs.
    """
    # TODO: additionally generate one more mask for generation
    data = {}
    taskcla = []

    # Others — read the task (domain) order for this random seed.
    f_name = 'asc_random'
    with open(f_name, 'r') as f_random_seq:
        random_sep = f_random_seq.readlines()[args.idrandom].split()

    print('random_sep: ', random_sep)
    print('domains: ', domains)
    print('random_sep: ', len(random_sep))
    print('domains: ', len(domains))

    for t in range(args.ntasks):
        asc_dataset = asc_datasets[domains.index(random_sep[t])]
        ae_dataset = ae_datasets[domains.index(random_sep[t])]
        data[t] = {}
        # Class count depends on the dataset family.
        if 'Bing' in asc_dataset:
            data[t]['name'] = asc_dataset
            data[t]['ncla'] = 2
        elif 'XuSemEval' in asc_dataset:
            data[t]['name'] = asc_dataset
            data[t]['ncla'] = 3
        print('ae_dataset: ', ae_dataset)

        logger.info("***** Running training *****")
        train_data, num_asc = _build_task_dataset(
            t, 'train', asc_dataset, ae_dataset, args, logger)
        num_train_steps = int(
            math.ceil(num_asc / args.train_batch_size)) * args.num_train_epochs
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_steps)
        data[t]['train'] = train_data
        data[t]['num_train_steps'] = num_train_steps

        logger.info("***** Running validations *****")
        valid_data, _ = _build_task_dataset(
            t, 'valid', asc_dataset, ae_dataset, args, logger)
        data[t]['valid'] = valid_data

        logger.info("***** Running evaluation *****")
        eval_data, _ = _build_task_dataset(
            t, 'test', asc_dataset, ae_dataset, args, logger)
        # Run prediction for full data
        data[t]['test'] = eval_data

        taskcla.append((t, int(data[t]['ncla'])))

    # Others — total number of classes across all tasks.
    n = 0
    for t in data.keys():
        n += data[t]['ncla']
    data['ncla'] = n

    return data, taskcla
def bert_tokenizer(self):
    """Create a lower-casing BERT word-piece tokenizer from self.vocab_file."""
    return BertTokenizer.from_pretrained(self.vocab_file, do_lower_case=True)
def Get_Bert_Representation(self, examples_train, examples_test):
    """Attach a BERT representation matrix (`example.bert_mat`) to every
    train/test example, loading from a pickle cache when both cache files
    exist, otherwise computing representations with a BERT model.

    NOTE(review): the visible code only reads the cache files — the branch
    that computes representations is never seen writing them; confirm the
    cache is produced elsewhere (or later in this method, beyond this view).
    """
    train_rep_file = "./data/" + pb.dataset + "_train_" + "bert"
    test_rep_file = "./data/" + pb.dataset + "_test_" + "bert"
    if (os.path.exists(train_rep_file) == True
            and os.path.exists(test_rep_file) == True):
        # Cached path: load pre-computed representations and assign by index.
        with open(train_rep_file, 'rb') as file:
            examples_train_rep = pickle.load(file)
        for i, example in enumerate(examples_train):
            example.bert_mat = examples_train_rep[i]
        with open(test_rep_file, 'rb') as file:
            examples_test_rep = pickle.load(file)
        for i, example in enumerate(examples_test):
            example.bert_mat = examples_test_rep[i]
    else:
        # Compute path: run BERT once per example (train first, then test).
        examples = []
        for example in examples_train:
            examples.append(example)
        for example in examples_test:
            examples.append(example)
        for i, example in enumerate(examples):
            # Lazily create the tokenizer/model on first use.
            if (self.bert_tokenizer == None):
                self.bert_tokenizer = BertTokenizer.from_pretrained(
                    'bert-base-uncased')
            text = "[CLS] " + example.fgt_channels[0] + " [SEP]"
            # NOTE(review): both arguments render as a single space after
            # whitespace mangling; presumably collapses double spaces —
            # verify against the original file.
            text = text.replace(" ", " ")
            tokenized_text = self.bert_tokenizer.tokenize(text)
            indexed_tokens = self.bert_tokenizer.convert_tokens_to_ids(
                tokenized_text)
            # Single-sentence input: all segment ids are 0.
            segments_ids = [0 for _ in tokenized_text]
            tokens_tensor = torch.tensor([indexed_tokens])
            segments_tensors = torch.tensor([segments_ids])
            if (self.bert == None):
                self.bert = BertModel.from_pretrained('bert-base-uncased')
                self.bert.eval()
            with torch.no_grad():
                # `sum` shadows the builtin; it counts the rows averaged.
                representation, sum = [], 0
                encoded_layers, _ = self.bert(tokens_tensor,
                                              segments_tensors)
                a, b = encoded_layers[0].numpy(
                ).shape[1], encoded_layers[0].numpy().shape[2]
                representation = np.zeros((a, b))
                # Average token representations over all encoder layers.
                for layer in encoded_layers:
                    for words in layer.numpy():
                        representation += words
                        sum += 1
                if (sum > 0):
                    representation = representation * 1.0 / sum
                representation = list(representation)
                # Pad with zero rows up to the fixed length, then truncate.
                while (len(representation) < pb.fgt_maxlength):
                    representation.append(np.zeros(b))
                example.bert_mat = representation[0:pb.fgt_maxlength]
            # Progress indicator (fraction of examples processed).
            print("{:.2%}".format(i * 1.0 / len(examples)))
from pytorch_pretrained_bert import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel
from pytorch_pretrained_bert import TransfoXLTokenizer, TransfoXLModel, TransfoXLLMHeadModel

## Download links for the pretrained weights can be found below:
# https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py
# https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/tokenization.py

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)

home = os.getenv('HOME')

##################################################################
## BERT
##################################################################
## BertTokenizer
# Load pre-trained model tokenizer (vocabulary) from a local vocab file.
tokenizer = BertTokenizer.from_pretrained(home + '/datasets/WordVec/pytorch_pretrained_bert/bert-large-uncased-vocab.txt')  # Load pre-trained model tokenizer (vocabulary)
print(tokenizer.max_len)  # 1000000000000; 512 for not large
print(len(tokenizer.vocab))  # 30522; words
print(type(tokenizer.vocab))  # <class 'collections.OrderedDict'>
print(tokenizer.vocab.get('hello', 0))  # 7592
print(tokenizer.vocab.get('helloworld', 0))  # 0 (out-of-vocabulary falls back to the default)
print(tokenizer.ids_to_tokens.get(7592, 'hello'))  # hello
print(tokenizer.ids_to_tokens.get(75920, 'hello'))  # hello (unknown id falls back to the default)
print(tokenizer.convert_ids_to_tokens([0, 1, 99, 100, 101, 102, 103, 104, 998, 999]))
# ['[PAD]', '[unused0]', '[unused98]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', '[unused99]', '[unused993]', '!']

text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = tokenizer.tokenize(text)  # Tokenized input
print(tokenized_text)
# ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', 'henson', 'was', 'a', 'puppet', '##eer', '[SEP]']

## Mask a token that we will try to predict back with `BertForMaskedLM`
masked_index = 8
tokenized_text[masked_index] = '[MASK]'
def main():
    """Average per-head attention entropies of a BERT model over text files
    (one file per language) and print one tab-separated row per language."""
    parser = argparse.ArgumentParser(__doc__)
    parser.add_argument("bert_model",
                        choices=[
                            "bert-base-uncased", "bert-large-uncased",
                            "bert-base-cased", "bert-base-multilingual-cased",
                            "bert-base-multilingual-uncased",
                            "bert-base-chinese"
                        ],
                        help="Variant of pre-trained model.")
    parser.add_argument(
        "language_data",
        nargs="+",
        type=str,
        help="Files with data, name of the file is language code.")
    parser.add_argument("--num-threads", type=int, default=4)
    parser.add_argument("--limit", type=int, default=10000)
    args = parser.parse_args()

    torch.set_num_threads(args.num_threads)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=False)
    model = BertModel.from_pretrained(args.bert_model,
                                      output_attentions=True,
                                      keep_multihead_output=True).to(device)
    model.eval()

    languages, entropies = [], []
    with torch.no_grad():
        for input_file in args.language_data:
            # The file name minus its 4-character extension is the language code.
            lng_code = input_file.split("/")[-1][:-4]
            print(f"Working on {lng_code}")

            head_totals = None  # running entropy sum per (layer, head)
            seen = 0
            for sentence_tensor in text_data_generator(input_file, tokenizer):
                seen += 1
                layer_attentions = model(sentence_tensor.unsqueeze(0))[0]
                heads_per_layer = layer_attentions[0].shape[1]
                if head_totals is None:
                    # Allocate one slot per attention head across all layers.
                    head_totals = np.zeros(
                        len(layer_attentions) * heads_per_layer)
                slot = 0
                for att_block in layer_attentions:
                    for matrix in att_block.squeeze(0):
                        # Mean (over rows) entropy of the attention
                        # distribution; 1e-9 guards log(0).
                        ent = -torch.mean(
                            (matrix * torch.log(matrix + 1e-9)).sum(1))
                        head_totals[slot] += ent.cpu().numpy()
                        slot += 1
                if seen >= args.limit:
                    break

            languages.append(lng_code)
            entropies.append(head_totals / seen)

    for lng, entropy in zip(languages, entropies):
        formatted_ent = "\t".join([f"{e:.5f}" for e in entropy])
        print(f"{lng}\t{formatted_ent}")