def __init__(self, MODEL_NAME):
    super(Classifier, self).__init__()
    self.l1 = BertModel.from_pretrained(MODEL_NAME)
    self.pre_classifier = torch.nn.Linear(768, 768)
    self.classifier = torch.nn.Linear(768, 2)
    self.dropout = torch.nn.Dropout(0.3)
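# Minimal forward-pass sketch for the Classifier above (an assumption, not part of the
# original snippet): the pooled [CLS] state is assumed to feed pre_classifier -> ReLU ->
# dropout -> classifier; the source model's actual forward may differ.
def forward(self, input_ids, attention_mask=None, token_type_ids=None):
    outputs = self.l1(input_ids=input_ids,
                      attention_mask=attention_mask,
                      token_type_ids=token_type_ids)
    pooled = outputs[1]                              # pooled [CLS] vector, shape (batch, 768)
    hidden = torch.relu(self.pre_classifier(pooled))
    logits = self.classifier(self.dropout(hidden))   # shape (batch, 2)
    return logits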
def __init__(self, args) -> None:
    super(SSNDM, self).__init__()
    self.hidden_size = args.hidden_size
    self.ext_ff_size = args.ext_ff_size
    self.ext_head_num = args.ext_head_num
    self.ext_dropout = args.ext_dropout
    self.ext_layer_num = args.ext_layer_num
    self.sect_num = args.sect_num
    self.max_seg_num = args.max_seg_num
    self.max_seg_len = args.max_seg_len
    self.memory_slots = args.memory_slots
    self.memory_dim = args.memory_dim
    self.memory_hops = args.memory_hops
    self.gat_head_num = args.gat_head_num
    self.per_head_dim = self.hidden_size // self.gat_head_num
    self.gat_dropout = args.gat_dropout
    self.bert_version = args.bert_version

    self.segment_encoder = BertModel.from_pretrained(self.bert_version)
    self.ext_transformer = TransformerEncoder(self.hidden_size, self.ext_ff_size,
                                              self.ext_head_num, self.ext_dropout,
                                              self.ext_layer_num)
    self.initial_memory = nn.Parameter(
        torch.normal(mean=0, std=1, size=(self.memory_slots, self.memory_dim)))
    self.section_embeddings = torch.nn.Embedding(self.sect_num, self.hidden_size)
    self.segment_position_embeddings = torch.nn.Embedding(self.max_seg_num, self.hidden_size)
    self.mlp_pred = torch.nn.Sequential(
        collections.OrderedDict([
            ('pred_dense_1', torch.nn.Linear(self.hidden_size * 3, 1024)),
            ('pred_relu_1', torch.nn.ReLU()),
            ('pred_dense_2', torch.nn.Linear(1024, 512)),
            ('pred_relu_2', torch.nn.ReLU()),
            ('pred_dense_3', torch.nn.Linear(512, 1)),
        ]))
    self.mlp_structural = torch.nn.Sequential(
        collections.OrderedDict([
            ('structural_dense_1', torch.nn.Linear(self.hidden_size, self.hidden_size)),
            ('structural_tanh_1', torch.nn.Tanh()),
        ]))
    self.mlp_gate = torch.nn.Sequential(
        collections.OrderedDict([
            ('gate_dense_1', torch.nn.Linear(768 * 2, 1)),
            ('gate_sigmoid_1', torch.nn.Sigmoid()),
        ]))
    self.mlp_merge = torch.nn.Sequential(
        collections.OrderedDict([
            ('merge_dense_1', torch.nn.Linear(768 * 2, 768)),
            ('merge_tanh_1', torch.nn.Tanh()),
        ]))
    self.mlp_att_q_lst = nn.ModuleList([
        torch.nn.Linear(self.hidden_size, self.hidden_size // self.gat_head_num)
        for _ in range(self.gat_head_num)
    ])
    self.mlp_att_k_lst = nn.ModuleList([
        torch.nn.Linear(self.hidden_size, self.hidden_size // self.gat_head_num)
        for _ in range(self.gat_head_num)
    ])
    self.mlp_att_v_lst = nn.ModuleList([
        torch.nn.Linear(self.hidden_size, self.hidden_size // self.gat_head_num)
        for _ in range(self.gat_head_num)
    ])
    self.attn_graph = MultiHeadAttention(input_dim=self.hidden_size,
                                         output_dim=self.hidden_size,
                                         dropout=self.gat_dropout)
    self.sigmoid = nn.Sigmoid()
        output = np.swapaxes(output, 0, 1)
        list_output.append(output)

        # ====== Construct Cache ====== #
        temp_cache = {}
        for i, sent in enumerate(mini_batch):
            hash_key = hashlib.sha256(sent.encode()).hexdigest()
            temp_cache[hash_key] = output[i]
        self.cache.update(temp_cache)

        idx += mini_batch_size
        self.count += mini_batch_size

    output = np.concatenate(list_output, 0)
    te = time.time()
    print('encoding with model', len(sentences), 'processed', self.count,
          'took', '{:4.1f}'.format(te - ts))
    embedding = self.get_multi_head_embedding(output, heads, head_size)
    return embedding


if __name__ == '__main__':
    model = BertModel.from_pretrained(
        "/home/users/whwodud98/exp/MRPC/bert-base-uncased/last3/0/checkpoint-0")
    model.cuda()
    model = torch.nn.DataParallel(model)
    model.eval()

    tokenizer = BertTokenizer.from_pretrained(
        "/home/users/whwodud98/exp/MRPC/bert-base-uncased/last3/0/checkpoint-0",
        do_lower_case=True)
def __init__(self, n_classes):
    super(SentimentClassifier, self).__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
def __init__(self, feature_engineer=False):
    '''
    @description: initialize the class, i.e. the models
    @param {type}:
        feature_engineer: whether to use feature engineering; if `False`, compare common ML models
        res_model: ResNet model
        resnext_model: ResNeXt model
        wide_model: Wide ResNet model
        bert: BERT model
        ml_data: new MLData instance
    @return: no return
    '''
    ### TODO
    # 1. Initialize a resnet152 model with torchvision
    # 2. Initialize a resnext101_32x8d model with torchvision
    # 3. Initialize a wide_resnet101_2 model with torchvision
    # 4. Load the BERT model
    print("load")
    self.res_model = torchvision.models.resnet152(pretrained=False)
    self.res_model.load_state_dict(
        torch.load(config.root_path + '/model/resnet150/resnet152-b121ed2d.pth'))
    self.res_model = self.res_model.to(config.device)
    print("res152 is ok")

    # self.resnext_model = torchvision.models.resnext101_32x8d(pretrained=True)
    # self.resnext_model = self.resnext_model.to(config.device)
    # self.wide_model = torchvision.models.wide_resnet101_2(pretrained=True)
    # self.wide_model = self.wide_model.to(config.device)

    self.bert_tonkenizer = BertTokenizer.from_pretrained(config.root_path + '/model/bert')
    self.bert = BertModel.from_pretrained(config.root_path + '/model/bert')
    self.bert = self.bert.to(config.device)
    print("bert is ok")

    self.ml_data = MLData(debug_mode=True)
    if feature_engineer:
        self.model = lgb.LGBMClassifier(objective='multiclass',
                                        n_jobs=10,
                                        num_class=33,
                                        num_leaves=30,
                                        reg_alpha=10,
                                        reg_lambda=200,
                                        max_depth=3,
                                        learning_rate=0.05,
                                        n_estimators=2000,
                                        bagging_freq=1,
                                        bagging_fraction=0.9,
                                        feature_fraction=0.8,
                                        seed=1440)
    else:
        self.models = [
            RandomForestClassifier(n_estimators=500, max_depth=5, random_state=0),
            LogisticRegression(solver='liblinear', random_state=0),
            MultinomialNB(),
            SVC(),
            lgb.LGBMClassifier(objective='multiclass',
                               n_jobs=10,
                               num_class=33,
                               num_leaves=30,
                               reg_alpha=10,
                               reg_lambda=200,
                               max_depth=3,
                               learning_rate=0.05,
                               n_estimators=2000,
                               bagging_freq=1,
                               bagging_fraction=0.8,
                               feature_fraction=0.8),
        ]
                    type=int,
                    required=False,
                    help="Number of Epochs")
args = parser.parse_args()

if args.featurizer == "cls_featurizer":
    feat = featurizer.cls_featurizer
elif args.featurizer == "avg_pooling_featurizer":
    feat = featurizer.avg_pooling_featurizer
else:
    raise ValueError("Please enter the name of an existing featurizer")

hf_weights_name = 'bert-base-uncased'
bert_tokenizer = BertTokenizer.from_pretrained(hf_weights_name)
bert_model = BertModel.from_pretrained(hf_weights_name)
for param in bert_model.parameters():
    param.requires_grad = False

train_dataset = PPDBDataset(corpus_path=args.data_path,
                            tokenizer=bert_tokenizer,
                            encoder=bert_model,
                            seq_len=128)  # max seq_length is 129, so 128 is an appropriate length
model = logisticRegressionClassifier(2, input_dim=768)
train(train_dataset,
      model,
      encoder=bert_model,
      featurizer=feat,
      epochs=args.epochs,
      batch_size=8)
def build_embedder(self):
    """Load BERT here."""
    return BertModel.from_pretrained(self.opt.bert_path)
    x = sentence - center
    return np.linalg.norm(x, 2)


data_path = '/home/ubuntu/likun/nlp_data/cluster/li_event.csv'
df = pd.read_csv(data_path)
test_text_col = 'use'

kws_path = '/home/ubuntu/likun/nlp_data/cluster/event_sentiment_kws.csv'
kws = pd.read_csv(kws_path)
sentiments = ['正向', '中立', '负向']  # keyword columns: positive / neutral / negative
centers = [kws[s][0].replace(';', ' ') for s in sentiments]

bert_model_path = '/home/ubuntu/likun/nlp_pretrained/bert-wwm-ext'
tokenizer = BertTokenizer.from_pretrained(bert_model_path, do_lower_case=True)
model = BertModel.from_pretrained(bert_model_path)

sentence_vectors = [
    get_vector(text, tokenizer, model)
    for text in tqdm.tqdm(df[test_text_col].tolist())
]
center_vectors = [
    get_vector(text, tokenizer, model) for text in tqdm.tqdm(centers)
]

distances = []
positions = []
for sentence in sentence_vectors:
    center_dis = []
    for center in center_vectors:
        center_dis.append(distance(sentence, center))
def __init__(self, uncased, temp_dir, finetune=False):
    super(Bert, self).__init__()
    self.model = BertModel.from_pretrained('huseinzol05/bert-base-bahasa-cased')
    self.finetune = finetune
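# Hypothetical usage sketch (not taken from the original file): when `finetune` is False,
# wrappers like this one typically run the encoder under torch.no_grad() so its weights
# stay frozen; the real forward of this class may be structured differently.
def forward(self, input_ids, attention_mask=None):
    if self.finetune:
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
    else:
        self.eval()
        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
    return outputs[0]  # token-level hidden states, shape (batch, seq_len, 768)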
def __init__(self, num_labels):
    super(VanillaBertLayer, self).__init__()
    self.bert = BertModel.from_pretrained(config.bert_directory,
                                          output_hidden_states=True,
                                          output_attentions=True,
                                          num_labels=num_labels)
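# Illustrative sketch only (assumed, not copied from the source): with
# output_hidden_states=True and output_attentions=True, the encoder exposes every
# layer's states and attention maps alongside the final hidden states.
def forward(self, input_ids, attention_mask=None, token_type_ids=None):
    outputs = self.bert(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)
    last_hidden = outputs.last_hidden_state  # (batch, seq_len, hidden)
    hidden_states = outputs.hidden_states    # tuple: embeddings + one tensor per layer
    attentions = outputs.attentions          # tuple: one attention map per layer
    return last_hidden, hidden_states, attentions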
            num_labels=2)
        config.update({"return_dict": False})
        model = BertForSequenceClassification(config=config)
        # model = CharacterBertForSequenceClassification(config=config)
        model.bert = CharacterBertModel.from_pretrained(
            os.path.join('pretrained_models', "general_character_bert"),
            config=config)
    else:
        logging.info('Loading %s model', "general_bert")
        config = BertConfig.from_pretrained(
            os.path.join('pretrained_models', "bert-base-uncased"),
            num_labels=2)
        config.update({"return_dict": False})
        model = BertForSequenceClassification(config=config)
        model.bert = BertModel.from_pretrained(
            os.path.join('pretrained_models', "bert-base-uncased"),
            config=config)

    model.to(args.device)

    if args.freeze_backbone:
        # freeze all params except the classification weights
        for param_name, param in model.named_parameters():
            if 'classifier' not in param_name:
                param.requires_grad = False

    for param_name, param in model.named_parameters():
        logging.info("Param %s, requires_grad = %s, size: %s" %
                     (param_name, param.requires_grad, param.size()))

    # train model using custom train/eval loop
    # global_step, train_loss, best_val_metric, best_val_epoch = train(
import itertools
import sys

import torch
import spacy
from lxml import etree
import numpy as np
from scipy.spatial.distance import cosine

from cdcrapp import CLIContext
from cdcrapp.model import Task, NewsArticle, SciPaper
from transformers import BertModel, BertTokenizerFast
from spacy.tokens.span import Span

torch.cuda.init()
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').cuda()
nlp = spacy.load('en')


def extract_mentions(text: str):
    """Extract named entities and noun phrases"""
    doc = nlp(text)
    return list(doc.ents) + list(doc.noun_chunks)


def get_tokens_by_offset(start: int, end: int, model_inputs: dict, second_doc=False):
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-german-cased')
model = BertModel.from_pretrained('bert-base-german-cased')

text = 'Wie alt bist du?'
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
print(output.pooler_output.detach().numpy())
import pickle

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Extract SVO triples from corpora
flat = itertools.chain.from_iterable

BERT_DIM = 768

# Load the pre-trained model tokenizer (vocabulary).
# Crucially, do not do basic tokenization; the PTB is already tokenized. Just do wordpiece tokenization.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
bert_model = BertModel.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True)
bert_model.to(device)
bert_model.eval()


def aggregate_by_key(f, xs):
    d = {}
    for x in xs:
        key = f(x)
        if key in d:
            d[key].append(x)
        else:
            d[key] = [x]
    return d
                    type=str2bool,
                    nargs='?',
                    help="Include original sentence in output")
parser.add_argument('--threshold',
                    default=0.003,
                    type=float,
                    help="Any attention score lower than this is removed")
args = parser.parse_args()

use_cuda = args.use_cuda
language_model = args.language_model

if __name__ == '__main__':
    tokenizer = AutoTokenizer.from_pretrained(language_model)
    encoder = BertModel.from_pretrained(language_model)
    encoder.eval()
    if use_cuda:
        encoder = encoder.cuda()

    input_filename = args.input_filename
    output_filename = args.output_filename
    include_sentence = args.include_text_output

    cached_entity_links = set()
    with open(input_filename, 'r') as f, open(output_filename, 'w') as g:
        for idx, line in enumerate(tqdm(f)):
            sentence = line.strip()
            if len(sentence):
                valid_triplets = []
                doc = Doc()
def __init__(self):
    super().__init__()
    self.bert = BertModel.from_pretrained(
        "bert-base-chinese", cache_dir="/data1/xul/models/bert/")
    self.qa_outputs = nn.Linear(768, 2)
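# Hedged sketch of how the qa_outputs head above is typically used (an assumption, not
# the source's actual forward): each token state is mapped to start/end logits, as in
# standard extractive question answering.
def forward(self, input_ids, attention_mask=None, token_type_ids=None):
    sequence_output = self.bert(input_ids=input_ids,
                                attention_mask=attention_mask,
                                token_type_ids=token_type_ids)[0]  # (batch, seq_len, 768)
    logits = self.qa_outputs(sequence_output)                      # (batch, seq_len, 2)
    start_logits, end_logits = logits.split(1, dim=-1)
    return start_logits.squeeze(-1), end_logits.squeeze(-1)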
def load_model(use_covidbert=False):
    """Function that loads and returns the CovidBERT model"""

    # # Load CovidBERT
    # if use_covidbert:
    #     print("Loading model...")
    #     model = AutoModelForMaskedLM.from_pretrained("deepset/covid_bert_base")
    #     print("Loading tokenizer...")
    #     tokenizer = AutoTokenizer.from_pretrained("deepset/covid_bert_base")
    #     print("Finished loading the model successfully!")
    #     model = SentenceTransformer(model_path)

    # # Load CovidBERT
    # if use_covidbert:
    #     print("Loading model...")
    #     model = AutoModelWithLMHead.from_pretrained("manueltonneau/clinicalcovid-bert-nli")
    #     print("Loading tokenizer...")
    #     print("\n")
    #     tokenizer = AutoTokenizer.from_pretrained("manueltonneau/clinicalcovid-bert-nli")
    #     print("\n")
    #     print("Finished loading the model successfully!")
    #     # Save the model to model path
    #     model_path = os.path.join("models", "clinicalcovid")
    #     if not os.path.exists(model_path):
    #         os.makedirs(model_path)
    #     model.save_pretrained(model_path)
    #     tokenizer.save_pretrained(model_path)
    #     model = SentenceTransformer(model_path)

    # Load CovidBERT
    if use_covidbert:
        print("Loading model...")
        model = AutoModelWithLMHead.from_pretrained("gsarti/covidbert-nli")
        print("Loading tokenizer...")
        print("\n")
        tokenizer = AutoTokenizer.from_pretrained("gsarti/covidbert-nli")
        print("\n")
        print("Finished loading the model successfully!")

        # Save the model to the model path
        model_path = os.path.join("models", "gsarticovid")
        if not os.path.exists(model_path):
            os.makedirs(model_path)
        model.save_pretrained(model_path)
        tokenizer.save_pretrained(model_path)
        print(f"Successfully saved model to {model_path}")

        print("Loading Sentence Transformer now!")
        word_embedding_model = models.BERT(
            model_path,
            # max_seq_length=args.max_seq_length,
            # do_lower_case=args.do_lower_case
        )
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
        rmtree(model_path)
        model.save(model_path)
        print("Finished building Sentence Transformer!")

    # Load regular BERT
    else:
        print("Loading BERT")
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertModel.from_pretrained('bert-base-uncased')
        print("Finished loading BERT")

    return model, tokenizer
from data.mybloodyplots import mybloodyplots
from transformers import BertModel, BertTokenizer
import torch
import numpy
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import re
import os
from torch.nn import CosineSimilarity
from tqdm import tqdm
from scipy.stats import spearmanr
import matplotlib
import matplotlib._color_data as mcd

bert_model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
cos = CosineSimilarity(dim=-1, eps=1e-6)

os.makedirs('results', exist_ok=True)

datasets = ['men.txt', 'simlex999.txt']
# datasets = ['simlex999.txt']
golden = mcd.CSS4_COLORS['goldenrod']
teal = mcd.CSS4_COLORS['steelblue']
colors = [teal, golden]
current_font_folder = '/import/cogsci/andrea/fonts'

for dataset in datasets:
    word_vectors = defaultdict(list)
        cross_attentions = []
        for i in range(len(self.blocks)):
            block_out, self_attn, cross_attn = self.blocks[i](out_,
                                                              attn_mask=attn_masks,
                                                              encoder_outputs=encoder_outputs,
                                                              return_self_attn=return_attn,
                                                              return_cross_attn=return_attn)
            # print(i, block_out)
            intermediate_outputs.append(block_out)
            self_attentions.append(self_attn)
            cross_attentions.append(cross_attn)
            out_ = block_out
            block_out = None
        return out_, intermediate_outputs


if __name__ == "__main__":
    config = {
        "vocab_size": 30522,
        "model_dims": 768,
        "segments": 2,
        "blocks": {
            "attn_heads": 12,
            "attn_dims": 64,
            "ff_inner_dims": 3072
        },
        "numblocks": 12,
        "dropout": 0.1,
        "layerNormEps": 1e-12
    }
    print(config)
    t = Transformer(config)

    from transformers import BertModel, BertConfig
    model = BertModel.from_pretrained("bert-base-uncased")
    print(model.config)
def __init__(self, n_classes, args=None):
    super().__init__()
    self.bert = BertModel.from_pretrained('bert-base-uncased')
    self.fc = nn.Linear(768, n_classes)
def test_model_from_pretrained(self):
    for model_name in BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
        model = BertModel.from_pretrained(model_name)
        self.assertIsNotNone(model)
import torch
from transformers import BertModel, BertTokenizer
from pk_classifier.utils import preprocess, encode_bert, tokens_emb, sentence_emb
import spacy

FAKE_TEXTS = [
    'Candida infections have increased due to the growth and expansion of susceptible patient '
    'populations.',
    'The clearance (CLz) of midazolam was 3.25l/h',
    ''
]
SPACY_MODEL = spacy.load("en_ner_bc5cdr_md")
BERT_TOKENIZER = BertTokenizer.from_pretrained("monologg/biobert_v1.1_pubmed")
BERT_MODEL = BertModel.from_pretrained("monologg/biobert_v1.1_pubmed", output_hidden_states=True)


def test_preprocess():
    docs = list(SPACY_MODEL.pipe(FAKE_TEXTS))
    docs_preprocessed = [
        preprocess(spacy_obj=doc, ident='', unifier=' ;; ', ngram=1, masking=False)
        for doc in docs
    ]
    assert len(docs_preprocessed) == len(docs)
    assert docs_preprocessed[0] == 'candida ;; infect ;; increas ;; growth ;; expans ;; suscept ;; patient ;; popul'
    assert docs_preprocessed[1] == 'clearanc ;; clz ;; midazolam ;; ## ;; lh'
    assert docs_preprocessed[2] == ''
def run():
    args = parser.parse_args()
    data = args.data
    nlayer = args.nlayer
    file_path = args.file_path
    save_path = os.path.join(file_path, 'model_params')

    MAX_VOCAB_SIZE = 50000
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    with open('aux_files_transfer/dataset_%d.pkl' % MAX_VOCAB_SIZE, 'rb') as f:
        dataset = pickle.load(f)
    # skip_list = np.load('aux_files/missed_embeddings_counter_%d.npy' % MAX_VOCAB_SIZE)
    embedding_matrix = np.load('aux_files_transfer/embeddings_glove_%d.npy' % (MAX_VOCAB_SIZE))
    embedding_matrix = torch.tensor(embedding_matrix.T).to(device)  # pytorch

    if data.lower() == 'imdb':
        data_path = 'aclImdb'

    bert = BertModel.from_pretrained('bert-base-uncased')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    with open('attack_results_final_stop_100_AL_150.pkl', 'rb') as f:
        results = pickle.load(f)

    seqs = []
    lens = []
    tgts = []
    for i in range(len(results[1])):
        if np.array(results[1][i]).shape == ():
            continue
        seqs.append(' '.join(
            [dataset.inv_dict[j] for j in results[1][i].tolist() if j != 0]))
        lens.append(results[2][i])
        tgts.append(results[3][i])
    lens = torch.tensor(lens)
    tgts = torch.tensor(tgts)

    data_processed = pre_processing(seqs)
    tokenizer_select = args.tokenizer
    tokenizer_selection = tokenizer_select
    if tokenizer_selection.lower() != 'bert':
        data_processed.processing()
        train_sequences, test_sequences = data_processed.bert_indx(tokenizer)
        print('Self preprocessing')
    else:
        data_processed.bert_tokenize(tokenizer)
        test_sequences = data_processed.bert_indx(tokenizer)
        print(test_sequences[:10])
        print('BERT tokenizer')
    test_text_init = data_processed.numerical(tokenizer, test_sequences)
    max_len = max([len(s) for s in test_text_init])
    test_text = pad_sequences(test_text_init, maxlen=max_len, padding='post')
    all_test_data = TensorDataset(torch.tensor(test_text), lens, tgts)
    all_test_loader_bert = DataLoader(all_test_data, batch_size=128, shuffle=True)

    lstm_size = 128
    rnn_state_save = os.path.join(save_path, 'best_bert_0.7_0.001_bert_100')
    # bert_lstm args: batch_size=batch_size, embedding_matrix=embedding_matrix,
    # hidden_size=lstm_size, kept_prob=0.73, num_layers=2, bidirection=True
    model = bert_lstm(bert, 2, False, nlayer, lstm_size, True, 0.7)
    model.eval()
    model.load_state_dict(torch.load(rnn_state_save))
    model = model.to(device)
    model.eval()

    test_pred = torch.tensor([])
    test_targets = torch.tensor([])
    with torch.no_grad():
        for batch_index, (seqs, length, target) in enumerate(all_test_loader_bert):
            seqs = seqs.type(torch.LongTensor)
            len_order = torch.argsort(length, descending=True)
            length = length[len_order]
            seqs = seqs[len_order]
            target = target[len_order]
            seqs, target, length = seqs.to(device), target.to(device), length.to(device)
            output, pred_out = model.pred(seqs, length, False)
            test_pred = torch.cat((test_pred, pred_out.cpu()), dim=0)
            test_targets = torch.cat((test_targets, target.type(torch.float).cpu()))

    accuracy = model.evaluate_accuracy(test_pred.numpy(), test_targets.numpy())
    print('Test Accuracy:{:.4f}.'.format(accuracy))
def main(args, logger):
    # trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv')
    trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl')
    tst_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/test.csv')
    trn_df['is_original'] = 1

    # clean texts
    trn_df = clean_data(trn_df, ['question_title', 'question_body', 'answer'])
    tst_df = clean_data(tst_df, ['question_title', 'question_body', 'answer'])

    gkf = GroupKFold(n_splits=5).split(
        X=trn_df.question_body,
        groups=trn_df.question_body_le,
    )

    histories = {
        'trn_loss': {},
        'val_loss': {},
        'val_metric': {},
        'val_metric_raws': {},
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)

    fold_best_metrics = []
    fold_best_metrics_raws = []
    for fold, (trn_idx, val_idx) in enumerate(gkf):
        if fold < loaded_fold:
            fold_best_metrics.append(np.max(histories["val_metric"][fold]))
            fold_best_metrics_raws.append(
                histories["val_metric_raws"][fold][np.argmax(
                    histories["val_metric"][fold])])
            continue
        sel_log(
            f' --------------------------- start fold {fold} --------------------------- ',
            logger)
        fold_trn_df = trn_df.iloc[trn_idx]  # .query('is_original == 1')
        fold_trn_df = fold_trn_df.drop(['is_original', 'question_body_le'], axis=1)
        # use only original rows
        fold_val_df = trn_df.iloc[val_idx].query('is_original == 1')
        fold_val_df = fold_val_df.drop(['is_original', 'question_body_le'], axis=1)
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)

        temp = pd.Series(
            list(
                itertools.chain.from_iterable(
                    fold_trn_df.question_title.apply(lambda x: x.split(' ')) +
                    fold_trn_df.question_body.apply(lambda x: x.split(' ')) +
                    fold_trn_df.answer.apply(lambda x: x.split(' '))))
        ).value_counts()
        tokens = temp[temp >= 10].index.tolist()
        # tokens = []
        tokens = [
            'CAT_TECHNOLOGY'.casefold(),
            'CAT_STACKOVERFLOW'.casefold(),
            'CAT_CULTURE'.casefold(),
            'CAT_SCIENCE'.casefold(),
            'CAT_LIFE_ARTS'.casefold(),
        ]

        trn_dataset = QUESTDataset(
            df=fold_trn_df,
            mode='train',
            tokens=tokens,
            augment=[],
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=DO_LOWER_CASE,
            LABEL_COL=LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode=TQA_MODE,
            TBSEP='[TBSEP]',
            pos_id_type='arange',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )
        # update token
        trn_sampler = RandomSampler(data_source=trn_dataset)
        trn_loader = DataLoader(trn_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=trn_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=True,
                                pin_memory=True)

        val_dataset = QUESTDataset(
            df=fold_val_df,
            mode='valid',
            tokens=tokens,
            augment=[],
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=DO_LOWER_CASE,
            LABEL_COL=LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode=TQA_MODE,
            TBSEP='[TBSEP]',
            pos_id_type='arange',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )
        val_sampler = RandomSampler(data_source=val_dataset)
        val_loader = DataLoader(val_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=val_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=False,
                                pin_memory=True)

        fobj = BCEWithLogitsLoss()
        state_dict = BertModel.from_pretrained(MODEL_PRETRAIN).state_dict()
        model = BertModelForBinaryMultiLabelClassifier(
            num_labels=len(LABEL_COL),
            config_path=MODEL_CONFIG_PATH,
            state_dict=state_dict,
            token_size=len(trn_dataset.tokenizer),
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )
        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                         T_max=MAX_EPOCH,
                                                         eta_min=1e-5)

        # load checkpointed model, optimizer, and scheduler
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(list(range(MAX_EPOCH))):
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue
            if epoch < 1:
                model.freeze_unfreeze_bert(freeze=True, logger=logger)
            else:
                model.freeze_unfreeze_bert(freeze=False, logger=logger)
            model = DataParallel(model)
            model = model.to(DEVICE)
            trn_loss = train_one_epoch(model, fobj, optimizer, trn_loader, DEVICE)
            val_loss, val_metric, val_metric_raws, val_y_preds, val_y_trues, val_qa_ids = test(
                model, fobj, val_loader, DEVICE, mode='valid')
            scheduler.step()

            if fold in histories['trn_loss']:
                histories['trn_loss'][fold].append(trn_loss)
            else:
                histories['trn_loss'][fold] = [trn_loss, ]
            if fold in histories['val_loss']:
                histories['val_loss'][fold].append(val_loss)
            else:
                histories['val_loss'][fold] = [val_loss, ]
            if fold in histories['val_metric']:
                histories['val_metric'][fold].append(val_metric)
            else:
                histories['val_metric'][fold] = [val_metric, ]
            if fold in histories['val_metric_raws']:
                histories['val_metric_raws'][fold].append(val_metric_raws)
            else:
                histories['val_metric_raws'][fold] = [val_metric_raws, ]

            logging_val_metric_raws = ''
            for val_metric_raw in val_metric_raws:
                logging_val_metric_raws += f'{float(val_metric_raw):.4f}, '

            sel_log(
                f'fold : {fold} -- epoch : {epoch} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_loss : {float(val_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_metric : {float(val_metric):.4f} -- '
                f'val_metric_raws : {logging_val_metric_raws}', logger)

            model = model.to('cpu')
            model = model.module
            save_checkpoint(
                f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                model,
                optimizer,
                scheduler,
                histories,
                val_y_preds,
                val_y_trues,
                val_qa_ids,
                fold,
                epoch,
                val_loss,
                val_metric,
            )
        fold_best_metrics.append(np.max(histories["val_metric"][fold]))
        fold_best_metrics_raws.append(
            histories["val_metric_raws"][fold][np.argmax(
                histories["val_metric"][fold])])
        save_and_clean_for_prediction(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                                      trn_dataset.tokenizer,
                                      clean=False)
        del model

    # calc training stats
    fold_best_metric_mean = np.mean(fold_best_metrics)
    fold_best_metric_std = np.std(fold_best_metrics)
    fold_stats = f'{EXP_ID} : {fold_best_metric_mean:.4f} +- {fold_best_metric_std:.4f}'
    sel_log(fold_stats, logger)
    send_line_notification(fold_stats)

    fold_best_metrics_raws_mean = np.mean(fold_best_metrics_raws, axis=0)
    fold_raw_stats = ''
    for metric_stats_raw in fold_best_metrics_raws_mean:
        fold_raw_stats += f'{float(metric_stats_raw):.4f},'
    sel_log(fold_raw_stats, logger)
    send_line_notification(fold_raw_stats)

    sel_log('now saving best checkpoints...', logger)
model constants and hyperparameter tuning params
"""
import torch
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup

# Device for running the model; a GPU is used automatically if available.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Pre-trained BERT model, which defines the tokens, buffers, etc.
PRE_TRAINED_MODEL = "bert-base-cased"

# BERT-based tokenizer.
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL)

# BERT model.
bert = BertModel.from_pretrained(PRE_TRAINED_MODEL)

# Maximum input tensor length per sentence.
SENTENCE_LENGTH = 128

# Random seed for the model.
RANDOM_SEED = 35

# Training test size: 10%.
training_test_size = 0.1

# Validation test size: 50%.
validation_test_size = 0.5

# Workers for the data loader.
WORKER = 4
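# Illustrative usage of the constants above (an added example, not part of the original
# config module): encode one sample sentence to SENTENCE_LENGTH tokens and run it
# through the encoder.
if __name__ == "__main__":
    sample = "The staff were friendly and the room was spotless."
    encoding = tokenizer(sample,
                         max_length=SENTENCE_LENGTH,
                         padding="max_length",
                         truncation=True,
                         return_tensors="pt")
    with torch.no_grad():
        outputs = bert(**encoding)
    print(outputs.last_hidden_state.shape)  # torch.Size([1, 128, 768])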
import torch
from transformers import BertModel, BertTokenizer
import logging

logging.basicConfig(level=logging.INFO)

tokenizer = BertTokenizer.from_pretrained("../wwm/data/aishell_word_table.txt")
model = BertModel.from_pretrained("../wwm/data")


# corpus
def get_contents():
    contents = []
    with open('../aishell_label/aishell_word_text.txt', 'r', encoding='utf-8') as fid:
        for line in fid:
            parts = line.strip().split(' ')
            utt_id = parts[0]
            text = parts[1:]
            contents.append(text)
    return contents


seq = get_contents()
# Each utterance is a list of words, so tokenize one utterance at a time.
tokens = [tokenizer.tokenize(' '.join(utt)) for utt in seq]
ids = [torch.tensor([tokenizer.convert_tokens_to_ids(t)]) for t in tokens]
# vocabulary
# fine-tuning
def get_bert(bert_option):
    model = BertModel.from_pretrained(bert_option)
    return model
def __init__(self, drop_rate=0.4, output_size=1):
    super().__init__()
    self.bert = BertModel.from_pretrained('bert-base-uncased')
    self.drop = torch.nn.Dropout(drop_rate)
    self.fc = torch.nn.Linear(768, output_size)  # 768 matches the BERT hidden size
# %% Load libraries
import torch
from transformers import BertTokenizer, BertConfig, BertModel

# %% Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained("beomi/kcbert-base", do_lower_case=False)

# %% Initialize the model
pt_model_config = BertConfig.from_pretrained("beomi/kcbert-base")
model = BertModel.from_pretrained("beomi/kcbert-base", config=pt_model_config)

# %% Check the configuration
print(pt_model_config)

# %% Create input values for the model
sentences = ["안녕하세요", "반갑습니다", "저의 이름은", "로이입니다"]
features = tokenizer(
    sentences,
    max_length=10,
    padding="max_length",
    truncation=True,
)

# %%
print(features.keys())

# %%
print(features['input_ids'])

# %%
print(features['token_type_ids'])

# %%
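# %% Illustrative continuation (added, not in the original notebook): the features above
# are plain Python lists, so convert them to tensors before running the model.
inputs = {k: torch.tensor(v) for k, v in features.items()}
with torch.no_grad():
    outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (4 sentences, 10 tokens, 768 hidden dims)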
import torch
import nltk

# use_cuda = config.use_gpu and torch.cuda.is_available()
if torch.cuda.is_available():
    device = torch.device("cuda:0")  # you can continue here with cuda:1, cuda:2, etc.
    print("Running on the GPU")
else:
    device = torch.device("cpu")
    print("Running on the CPU")

from transformers import BertModel, BertTokenizer, BertForQuestionAnswering

# Create an instance of BertModel
bert_model = BertModel.from_pretrained('bert-base-multilingual-cased')
bert_model = bert_model.to(device)

# Create an instance of the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

hidden_size = 768
hidden_dim = 50
layer_dim = 1
batch_size = 1
max_tot_len = 500
max_ques_len = 90
stride_len = 90


def prepare_data(para_tokens, ques_tokens, start, end):
    all_tokens_len = 0