def __init__(self, args) -> None:
    """Use ELM with a fine-tuned language model for sentiment classification.

    Args:
        args (dict): contains all the arguments needed.
            - model_name (str): the name of the transformer model
            - bsz (int): batch size
            - epoch: epochs to train
            - type (str): fine-tuning type
                - base: train only the ELM
                - finetune_elm: train the transformer with the ELM directly
                - finetune_classifier: train the transformer with a classifier
                - finetune_classifier_elm: train the transformer with a classifier,
                  then use the ELM to replace the classifier
                - finetune_classifier_beta: train the transformer with a classifier,
                  and use pinv to calculate beta in the classifier
            - learning_rate (float): learning rate for fine-tuning
    """
    # load configuration
    self.model_name = args.get('model_name', 'bert-base-uncased')
    self.bsz = args.get('batch_size', 10)
    self.epoch = args.get('epoch_num', 2)
    self.learning_rate = args.get('learning_rate', 0.001)
    self.training_type = args.get('training_type', 'base')
    self.debug = args.get('debug', True)
    self.eval_epoch = args.get('eval_epoch', 1)
    self.lr_decay = args.get('learning_rate_decay', 0.99)

    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    self.device = device
    self.n_gpu = torch.cuda.device_count()

    # load pretrained model
    if (self.model_name == 'bert-base-uncased') or \
            (self.model_name == 'distilbert-base-uncased') or \
            (self.model_name == 'albert-base-v2'):
        self.pretrained_model = AutoModel.from_pretrained(self.model_name)
        self.pretrained_tokenizer = AutoTokenizer.from_pretrained(
            self.model_name)
        input_shape = 768
        output_shape = 256
    elif self.model_name == 'prajjwal1/bert-tiny':
        self.pretrained_model = AutoModel.from_pretrained(self.model_name)
        self.pretrained_tokenizer = AutoTokenizer.from_pretrained(
            self.model_name, model_max_length=512)
        input_shape = 128
        output_shape = 64
    elif self.model_name == 'voidful/albert_chinese_xxlarge':
        self.pretrained_model = AlbertForMaskedLM.from_pretrained(
            self.model_name)
        self.pretrained_tokenizer = BertTokenizer.from_pretrained(
            self.model_name)
        input_shape = 768
        output_shape = 256
    else:
        raise TypeError("Unsupported model name")
    self.pretrained_model.to(device)

    device_ids = None
    if self.n_gpu > 1:
        device_ids = range(torch.cuda.device_count())
        self.pretrained_model = DP(self.pretrained_model,
                                   device_ids=device_ids)

    # load specific model
    if (self.training_type == 'finetune_classifier') or \
            (self.training_type == 'finetune_classifier_elm'):
        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(input_shape, 2))
        self.loss_func = torch.nn.CrossEntropyLoss()
        self.classifier.to(device)
        if self.n_gpu > 1:
            self.classifier = DP(self.classifier, device_ids=device_ids)
    if (self.training_type == 'base') or \
            (self.training_type == 'finetune_classifier_elm'):
        self.elm = classic_ELM(input_shape, output_shape)
    if self.training_type == 'finetune_classifier_linear':
        self.elm = classic_ELM(None, None)
        self.classifier = torch.nn.Sequential(
            OrderedDict([
                ('w', torch.nn.Linear(input_shape, output_shape)),
                ('act', torch.nn.Sigmoid()),
                ('beta', torch.nn.Linear(output_shape, 2)),
            ]))
        self.loss_func = torch.nn.CrossEntropyLoss()
        self.classifier.to(device)
        if self.n_gpu > 1:
            self.classifier = DP(self.classifier, device_ids=device_ids)

    # load processor, trainer, evaluator, inferer.
    processors = {
        'base': self.__processor_base__,
        'finetune_classifier': self.__processor_base__,
        'finetune_classifier_elm': self.__processor_base__,
        'finetune_classifier_linear': self.__processor_base__,
    }
    trainers = {
        'base': self.__train_base__,
        'finetune_classifier': self.__train_finetune_classifier__,
        'finetune_classifier_elm': self.__train_finetune_classifier_elm__,
        'finetune_classifier_linear': self.__train_finetune_classifier_linear__,
    }
    evaluators = {
        'base': self.__eval_base__,
        'finetune_classifier': self.__eval_finetune_classifier__,
        'finetune_classifier_elm': self.__eval_base__,
        'finetune_classifier_linear': self.__eval_finetune_classifier_linear__,
    }
    inferers = {
        'base': self.__infer_base__,
        'finetune_classifier': self.__infer_finetune_classifier__,
        'finetune_classifier_elm': self.__infer_finetune_classifier_elm__,
        'finetune_classifier_linear': self.__infer_base__,
    }
    self.processor = processors[self.training_type]
    self.trainer = trainers[self.training_type]
    self.evaluator = evaluators[self.training_type]
    self.inferer = inferers[self.training_type]
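# Usage sketch (illustrative, not from the original file): the keys below are the
# ones read via args.get(...) in __init__ above. The owning class's name is not
# shown in this excerpt, so the constructor call is left commented with a
# hypothetical name.
example_args = {
    'model_name': 'bert-base-uncased',
    'batch_size': 16,
    'epoch_num': 2,
    'learning_rate': 1e-3,
    'training_type': 'finetune_classifier_elm',
    'debug': False,
    'eval_epoch': 1,
    'learning_rate_decay': 0.99,
}
# clf = ELMClassifier(example_args)   # hypothetical class name
# clf.trainer / clf.evaluator / clf.inferer are then bound to the
# 'finetune_classifier_elm' routines selected from the dispatch tables above.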
    return el_pres.item()


import json
import torch
import time

from BiencoderRanker4 import BiencoderRanker
from mention_detection.mention_data_proc_all import ReadTrainDectMent, IterData
from mention_detection.utils import *
from transformers import BertTokenizer
from faiss_indexer import DenseFlatIndexer

# torch.cuda.set_device(1)
tokenizer = BertTokenizer.from_pretrained('./model/bert-large-uncased')
biencoder_params = json.load(open('./model/biencoder/wiki_encoder_large2.json'))

with torch.no_grad():
    ranker = BiencoderRanker(biencoder_params)
    ranker.load_state_dict(torch.load('./model/mybiencoder_wiki.bin'))
    for params in ranker.parameters():
        params.requires_grad = False
    ranker = ranker.to('cpu')

trainfile_path = './Data/train.jsonl'
train_data = ReadTrainDectMent(trainfile_path, True)
train_data.padding()
batch_size = 32
dataload = IterData(train_data, batch_size, True, True)
top_k = 1
            tmp = [rewards[i]]
        else:
            tmp.append(rewards[i])
    sent_rewards.append(sum(tmp) / len(tmp))

    token_rewards = []
    for _ in gpt_mapping:
        token_rewards.append(sent_rewards[_])

    return token_rewards


model.load_state_dict(torch.load(args.load_from))
print("loading from {}".format(args.load_from))
model.train()

bert_tokenizer = BertTokenizer.from_pretrained(args.modelpath)
scorer = BERTGen(bert_tokenizer.vocab_size, args.dim, args.layers,
                 args.head, args.modelpath)
scorer.to(args.device)
scorer.load_state_dict(torch.load('models/BERT_scorer_ep9.pt'))
scorer.eval()

optimizer = optim.Adam(model.parameters(), 5e-7)

avg_loss = 0
for epoch_idx in range(args.epoch):
    print("start training {}th epoch".format(epoch_idx))
    dataset.shuffle()
    for idx in range(0, dataset.train_len()):
        batch = dataset.get_data(idx, details=True)
        table, sub_columns, title = batch[4:]
logging.getLogger('transformers.tokenization_utils').disabled = True

import numpy as np
import json
import pickle
import datetime
# import spacy
# from allennlp.commands.elmo import ElmoEmbedder

torch.cuda.is_available()

tokenizer_gpt2 = GPT2Tokenizer.from_pretrained('gpt2')
model_gpt2 = GPT2LMHeadModel.from_pretrained('gpt2', output_hidden_states=True)
model_gpt2.eval()
model_gpt2.to('cuda')

tokenizer_bert = BertTokenizer.from_pretrained('bert-base-cased')
model_bert = BertModel.from_pretrained('bert-base-cased')
model_bert.eval()
model_bert.to('cuda')

tokenizer_gpt = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model_gpt = OpenAIGPTModel.from_pretrained('openai-gpt')
model_gpt.eval()
model_gpt.to('cuda')

# weat 1
flowers = ['aster', 'clover', 'hyacinth', 'marigold', 'poppy', 'azalea', 'crocus',
           'iris', 'orchid', 'rose', 'bluebell', 'daffodil', 'lilac', 'pansy',
           'tulip', 'buttercup', 'daisy', 'lily', 'peony', 'violet', 'carnation',
           'magnolia', 'petunia', 'zinnia', 'gladiola']  # 'gladiola' deleted since it does not appear
insects = ['ant', 'caterpillar', 'flea', 'locust', 'spider', 'bedbug', 'centipede',
           'fly', 'maggot', 'tarantula', 'bee', 'cockroach', 'gnat', 'mosquito',
           'termite', 'beetle', 'cricket', 'hornet', 'moth', 'wasp', 'dragonfly',
           'horsefly', 'roach', 'weevil', 'blackfly']  # 'blackfly' deleted for symmetry since it only appears once
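# Illustrative sketch (not part of the original script): pulling a contextual
# vector for one word from model_bert loaded above. `torch` is assumed to be
# imported earlier in the file, and the token index (1) assumes the word is not
# split into multiple word pieces.
with torch.no_grad():
    ids = torch.tensor([tokenizer_bert.encode('rose', add_special_tokens=True)]).to('cuda')
    hidden_states = model_bert(ids)[0]   # (1, seq_len, 768) last hidden layer
    rose_vec = hidden_states[0, 1]       # vector for the token right after [CLS]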
def get_estimator(max_len=20,
                  epochs=10,
                  batch_size=64,
                  train_steps_per_epoch=None,
                  eval_steps_per_epoch=None,
                  save_dir=tempfile.mkdtemp(),
                  pretrained_model='bert-base-uncased',
                  data_dir=None):
    # step 1: prepare data
    train_data, eval_data, data_vocab, label_vocab = mitmovie_ner.load_data(root_dir=data_dir)
    tokenizer = BertTokenizer.from_pretrained(pretrained_model, do_lower_case=True)
    tag2idx = char2idx(label_vocab)
    pipeline = fe.Pipeline(
        train_data=train_data,
        eval_data=eval_data,
        batch_size=batch_size,
        ops=[
            Tokenize(inputs="x", outputs="x", tokenize_fn=tokenizer.tokenize),
            WordtoId(inputs="x", outputs="x", mapping=tokenizer.convert_tokens_to_ids),
            WordtoId(inputs="y", outputs="y", mapping=tag2idx),
            PadSequence(max_len=max_len, inputs="x", outputs="x"),
            PadSequence(max_len=max_len, value=len(tag2idx), inputs="y", outputs="y"),
            AttentionMask(inputs="x", outputs="x_masks")
        ])
    # step 2: prepare model
    model = fe.build(model_fn=lambda: ner_model(max_len, pretrained_model, label_vocab),
                     optimizer_fn=lambda: tf.optimizers.Adam(1e-5))
    network = fe.Network(ops=[
        ModelOp(model=model, inputs=["x", "x_masks"], outputs="y_pred"),
        Reshape(inputs="y", outputs="y", shape=(-1, )),
        Reshape(inputs="y_pred", outputs="y_pred", shape=(-1, len(label_vocab) + 1)),
        CrossEntropy(inputs=("y_pred", "y"), outputs="loss"),
        UpdateOp(model=model, loss_name="loss")
    ])
    traces = [
        Accuracy(true_key="y", pred_key="y_pred"),
        BestModelSaver(model=model, save_dir=save_dir)
    ]
    # step 3: prepare estimator
    estimator = fe.Estimator(network=network,
                             pipeline=pipeline,
                             epochs=epochs,
                             traces=traces,
                             train_steps_per_epoch=train_steps_per_epoch,
                             eval_steps_per_epoch=eval_steps_per_epoch)
    return estimator
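# Illustrative usage (not part of the original snippet): FastEstimator estimators
# are normally driven via fit() and test(); the epoch count here is arbitrary.
if __name__ == "__main__":
    est = get_estimator(epochs=2)
    est.fit()
    est.test()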
def extractor():
    tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
    device = torch.device(cuda_num if torch.cuda.is_available() else "cpu")
    model = BertForTokenClassification.from_pretrained(
        PRETRAINED_MODEL_NAME, num_labels=len(tag2idx))
    model = model.to(device)
    model.load_state_dict(
        torch.load(os.path.join(ModelName.format(model_idx), 'pytorch_model.bin'),
                   map_location="cpu"))
    model.eval()

    def predict(doc):
        names, docs = [], []
        predset = NameDoc(tokenizer=tokenizer, doc=doc)
        dataloader = torch.utils.data.DataLoader(
            predset, batch_size=1, shuffle=True, collate_fn=create_mini_batch)
        with torch.no_grad():
            for tokens, *data in dataloader:
                if next(model.parameters()).is_cuda:
                    data = [t.to(device) for t in data if t is not None]
                tokens_tensors, segments_tensors, masks_tensors, labels = data
                outputs = model(input_ids=tokens_tensors,
                                token_type_ids=None,
                                # token_type_ids=segments_tensors,
                                attention_mask=masks_tensors)
                logits = outputs[0]
                logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
                logits = logits.detach().cpu().numpy()
                # Only predict on real words; positions with mask=0 are not counted
                masks_tensors = masks_tensors.to('cpu').numpy()
                tr = lambda e: ','.join(e) if len(e) == 2 and all_english(''.join(e)) else ''.join(e)
                for i, mask in enumerate(masks_tensors):
                    name, doc = [], ''
                    names.append([])
                    for j, m in enumerate(mask):
                        if m:
                            if logits[i][j] not in (tag2idx['[CLS]'], tag2idx['[SEP]']):
                                doc += tokens[i][j - 1]
                            if logits[i][j] == tag2idx['B-per']:
                                if name:
                                    names[-1].append(tr(name))
                                    name = []
                                name.append(tokens[i][j - 1])
                            elif logits[i][j] == tag2idx['I-per']:
                                name.append(tokens[i][j - 1])
                            elif name:
                                names[-1].append(tr(name))
                                name = []
                        else:
                            break
                    if name:
                        names[-1].append(tr(name))
                    docs.append(doc)
        # need to filter the names from doc and do classification again
        return names, docs

    nft = namefilter()

    def _ext(doc):
        names, docs = predict(doc)
        print('original names', names)
        return nft(list(set().union(*names)), ''.join(docs))

    return _ext
parser.add_argument('--dictionary', default=None, type=str, help='dictionary path')
args = parser.parse_args()

logger = create_logger(config.root_path + '/logs/main.log')

if __name__ == '__main__':
    model_name = args.model
    x = import_module('models.' + model_name)
    if model_name in ['bert', 'xlnet', 'roberta']:
        config.bert_path = config.root_path + '/model/' + model_name + '/'
        # check 'roberta' before 'bert': 'bert' is a substring of 'roberta'
        if 'roberta' in model_name:
            config.tokenizer = RobertaTokenizer.from_pretrained(config.bert_path)
        elif 'xlnet' in model_name:
            config.tokenizer = XLNetTokenizer.from_pretrained(config.bert_path)
        elif 'bert' in model_name:
            config.tokenizer = BertTokenizer.from_pretrained(config.bert_path)
        else:
            raise NotImplementedError
        config.save_path = config.root_path + 'model/saved_dict/' + model_name + '.ckpt'  # trained model checkpoint
        config.log_path = config.root_path + '/logs/' + model_name
        config.hidden_size = 768
    config.eps = 1e-8
    config.gradient_accumulation_steps = 1
    config.word = True
    config.max_length = 400
                    default=True,
                    action='store_true',
                    help="Set this flag if you are using an uncased model.")
parser.add_argument("--max_seq_length",
                    default=512,
                    type=int,
                    help="The maximum total input sequence length after WordPiece tokenization. "
                         "Sequences longer than this will be truncated, and sequences shorter "
                         "than this will be padded.")
parser.add_argument("--batch_size",
                    default=15,
                    type=int,
                    help="Batch size for predictions.")
parser.add_argument("--full_data", type=str, required=True)
parser.add_argument('--tokenizer_path', type=str, required=True)
args = parser.parse_args()

tokenizer = BertTokenizer.from_pretrained(args.tokenizer_path)
examples = read_examples(full_file=args.full_data)
with gzip.open(args.example_output, 'wb') as fout:
    pickle.dump(examples, fout)

features = convert_examples_to_features(examples,
                                        tokenizer,
                                        max_seq_length=512,
                                        max_query_length=50)
with gzip.open(args.feature_output, 'wb') as fout:
    pickle.dump(features, fout)
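# Illustrative sketch (not in the original script): the gzip-pickled examples and
# features written above can be read back the same way they were stored;
# `args.feature_output` is assumed to be defined by an argument added outside
# this excerpt.
def load_cached(path):
    with gzip.open(path, 'rb') as fin:
        return pickle.load(fin)

# features = load_cached(args.feature_output)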
    per_device_eval_batch_size=batch_size_per_gpu,
    save_steps=-1,
    evaluate_during_training=True,
    output_dir=model_path,
    overwrite_output_dir=another_version,
    do_train=True,
    do_eval=True,
    do_predict=True,
    learning_rate=learning_rate,
    num_train_epochs=num_train_epochs,
)
set_seed(training_args.seed)

# Data Preprocess
tokenizer = BertTokenizer.from_pretrained(
    tokenizer_path,
    cache_dir=tokenizer_path,
)
tokenizer.save_vocabulary(tokenizer_path)
tokenizer.save_pretrained(tokenizer_path)

train_dataset = (MultipleChoiceDataset(
    data_dir=data_path,
    tokenizer=tokenizer,
    task=task_name,
    max_seq_length=max_seq_length,
    overwrite_cache=overwrite_tokenizer,
    mode=Split.train,
) if training_args.do_train else None)
eval_dataset = (MultipleChoiceDataset(
    data_dir=data_path,
datafile = args.datafile
data_col = 0
label_col = int(args.label_col)
max_len = 100

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

acc_cum = 0
rec_cum = 0
pre_cum = 0
f1_cum = 0
f1_cum_mic = 0
acc_arr = []
rec_arr = []
pre_arr = []
f1_arr = []
f1_arr_mic = []

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True,
                                          add_special_tokens=True,
                                          max_length=max_len,
                                          pad_to_max_length=True)

# ------------------------------------------------------------------------------------------------
text_data, labels = prepare_dataset(datafile, data_col, label_col, "word-based")
print("Number of Examples: ", len(text_data))

encoder = LabelEncoder()
encoder.fit(labels)
encoded_labels = encoder.transform(labels)
class_weights_labels = class_weight.compute_class_weight('balanced',
                                                         np.unique(encoded_labels),
                                                         encoded_labels)

num_classes = len(list(encoder.classes_))
print("num_classes: ", num_classes)
def __init__(self, reviews, targets, max_len):
    self.reviews = reviews
    self.targets = targets
    self.tokenizer = BertTokenizer.from_pretrained('cl-tohoku/bert-base-japanese')
    self.max_len = max_len
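# Illustrative completion (the original class body is not shown past __init__):
# a typical __len__/__getitem__ for such a review Dataset, assuming `reviews`
# holds raw strings, `targets` holds integer labels, `torch` is imported, and a
# reasonably recent transformers version (padding=/truncation= arguments).
def __len__(self):
    return len(self.reviews)

def __getitem__(self, idx):
    encoding = self.tokenizer.encode_plus(
        str(self.reviews[idx]),
        add_special_tokens=True,
        max_length=self.max_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt')
    return {
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'target': torch.tensor(self.targets[idx], dtype=torch.long),
    }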
    print(len(all_documents))
    for document_index in tqdm(range(len(all_documents))):
        instances.extend(
            create_instances_from_document(all_documents, document_index,
                                           max_seq_len, short_seq_prob,
                                           max_ngram, masked_lm_prob,
                                           max_predictions_per_seq, vocab_words))
    random.shuffle(instances)
    return instances


if __name__ == '__main__':
    # input_file = './corpus/pro_data.txt'
    tokenizer = BertTokenizer(vocab_file='./bert_base_pretrain/vocab.txt',
                              do_lower_case=True)
    max_seq_len = 512
    short_seq_prob = 0.3
    max_ngram = 3
    masked_lm_prob = 0.15
    max_predictions_per_seq = 20
    file_list = ['./data/train.data']
    for i, input_file in enumerate(file_list):
        print('处理第{}个文件的数据'.format(i))  # "processing the data of file {}"
        with open('./data/processed_data{}.json'.format(i), 'w', encoding='utf8') as f:
            file_examples = create_training_instances(
                input_file, tokenizer, max_seq_len, short_seq_prob, max_ngram,
                masked_lm_prob, max_predictions_per_seq)
torch.manual_seed(seed)
np.random.seed(seed)

# Set device
os.environ["CUDA_VISIBLE_DEVICES"] = args['cuda']
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

num_labels = 3 if task == 'c' else 2

# Set tokenizer for different models
if model_name == 'bert':
    if task == 'all':
        model = MTL_Transformer_LSTM(model_name, model_size, args=args)
    else:
        model = BERT(model_size, args=args, num_labels=num_labels)
    tokenizer = BertTokenizer.from_pretrained(f'bert-{model_size}-uncased')
elif model_name == 'roberta':
    if task == 'all':
        model = MTL_Transformer_LSTM(model_name, model_size, args=args)
    else:
        model = RoBERTa(model_size, args=args, num_labels=num_labels)
    tokenizer = RobertaTokenizer.from_pretrained(f'roberta-{model_size}')
elif model_name == 'bert-gate' and task == 'all':
    model_name = model_name.replace('-gate', '')
    model = GatedModel(model_name, model_size, args=args)
    tokenizer = BertTokenizer.from_pretrained(f'bert-{model_size}-uncased')
elif model_name == 'roberta-gate' and task == 'all':
    model_name = model_name.replace('-gate', '')
    model = GatedModel(model_name, model_size, args=args)
    tokenizer = RobertaTokenizer.from_pretrained(f'roberta-{model_size}')
def RBERTQ1_data_preprocessor(input_file, csv_output_file, features_output_file): input_train_data = input_file max_sentence_len = 512 final_data_list = [] sentence_count = 0 sentence_list = [] exception_count = 0 # Load pre-trained model tokenizer (vocabulary) tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') print("Started data preprocessing") with open(input_train_data) as reader_file: for sentence in reader_file: #if sentence_count == 5: #break try: input_data = [] # Split the sentence into tokens with BERT tokenizer splited_text = sentence.split('\t') query_tokenized_text = tokenizer.tokenize(splited_text[1]) tokenized_text = tokenizer.tokenize(splited_text[2]) #print(tokenized_text) sentence_list.append((splited_text[2], splited_text[3])) ent1_pos_st = tokenized_text.index('$') ent1_pos_end = tokenized_text.index('$', ent1_pos_st+1) ent2_pos_st = tokenized_text.index('#') ent2_pos_end = tokenized_text.index('#', ent2_pos_st+1) #print(ent1_pos_st, ent1_pos_end, ent2_pos_st, ent2_pos_end) if len(query_tokenized_text) > max_sentence_len: query_tokenized_text = query_tokenized_text[:max_sentence_len] # If the length of the sentence is more than max length then truncate if len(tokenized_text) > max_sentence_len: tokenized_text = tokenized_text[:max_sentence_len] # If the length of the sentence is more than max length then truncate # Map the token strings to their vocabulary indeces. query_indexed_tokens = tokenizer.convert_tokens_to_ids(query_tokenized_text) # Mark each of the tokens as belonging to sentence "0". query_segments_ids = [0] * len(query_tokenized_text) # Mask the sentence tokens with 1 query_att_mask = [1] * len(query_indexed_tokens) # padding the rest of the sequence length query_padding_len = max_sentence_len - len(query_indexed_tokens) # Add the padded token to the indexed tokens query_indexed_tokens = query_indexed_tokens + [0]*query_padding_len # Mask the padded tokens with 0 query_att_mask = query_att_mask + [0]*query_padding_len # Mark the padded tokens as belonging to sentence "0" query_segments_ids = query_segments_ids + [0]*query_padding_len # Map the token strings to their vocabulary indeces. indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) # Mark each of the tokens as belonging to sentence "0". 
segments_ids = [0] * len(tokenized_text) # Mask the sentence tokens with 1 att_mask = [1] * len(indexed_tokens) # padding the rest of the sequence length padding_len = max_sentence_len - len(indexed_tokens) # Add the padded token to the indexed tokens indexed_tokens = indexed_tokens + [0]*padding_len # Mask the padded tokens with 0 att_mask = att_mask + [0]*padding_len # Mark the padded tokens as belonging to sentence "0" segments_ids = segments_ids + [0]*padding_len # Initialize entity masks ent1_mask = [0]*len(att_mask) ent2_mask = [0]*len(att_mask) # Mark the entity masks with 1 in the entity positions for ent1_ind in range(ent1_pos_st+1, ent1_pos_end): ent1_mask[ent1_ind] = 1 #print(ent1_mask) for ent2_ind in range(ent2_pos_st+1, ent2_pos_end): ent2_mask[ent2_ind] = 1 input_data.append(indexed_tokens) input_data.append(segments_ids) input_data.append(att_mask) input_data.append(ent1_mask) input_data.append(ent2_mask) input_data.append(query_indexed_tokens) input_data.append(query_segments_ids) input_data.append(query_att_mask) input_data.append([int(splited_text[3])]) input_data.append([int(splited_text[0])]) input_data.append([splited_text[4]]) final_data_list.append(input_data) sentence_count += 1 print("sentence count : %d " % sentence_count) except ValueError: exception_count += 1 print("exception count : %d " % exception_count) except Exception: exception_count += 1 print("general exception") print("exception count : %d " % exception_count) #print("The sentence count is %d" % sentence_count) # if os.path.exists(features_file): # print('in if') # final_data_list = torch.load(features_output_file) # else: torch.save(final_data_list, features_output_file) writer = csv.writer(open(csv_output_file, 'w')) writer.writerows(final_data_list) # indexed_tokens_tensor = torch.tensor([ind_tokens[0] for ind_tokens in final_data_list]) # segment_ids_tensor = torch.tensor([seg_ids[1] for seg_ids in final_data_list]) # att_mask_tensor = torch.tensor([attn[2] for attn in final_data_list]) # ent1_mask_tensor = torch.tensor([ent1_mask[3] for ent1_mask in final_data_list]) # ent2_mask_tensor = torch.tensor([ent2_mask[4] for ent2_mask in final_data_list]) # query_indexed_tokens_tensor = torch.tensor([q_ind_tokens[5] for q_ind_tokens in final_data_list]) # query_segment_ids_tensor = torch.tensor([q_seg_ids[6] for q_seg_ids in final_data_list]) # query_att_mask_tensor = torch.tensor([q_attn[7] for q_attn in final_data_list]) # labels_tensor = torch.tensor([labels[8] for labels in final_data_list]) # seqid_tensor = torch.tensor([seqid[9] for seqid in final_data_list]) # #print(ent1_mask_tensor.shape) # print("Finished Data Preprocessing") # final_dataset = torch.utils.data.TensorDataset( # indexed_tokens_tensor, # segment_ids_tensor, # att_mask_tensor, # ent1_mask_tensor, # ent2_mask_tensor, # query_indexed_tokens_tensor, # query_segment_ids_tensor, # query_att_mask_tensor, # labels_tensor, # seqid_tensor # ) # return final_dataset
def __init__(self, split_name='validation'):
    self.splitname = split_name
    self.sen1, self.sen2, self.label = self.read_tsv(self.splitname)
    self.tokenizer = BertTokenizer.from_pretrained(scitailConfig.tokenizer_name)
print()
print("Number of GPUs: ", n_gpu)
print()

batch_size = batch_size // gradient_accumulation_steps

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if n_gpu > 0:
    torch.cuda.manual_seed_all(seed)

if args.local_rank not in [-1, 0]:
    torch.distributed.barrier()

tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=do_lower_case)
model_qa = BertQA.from_pretrained(args.output_dir)
model_qa.to(device)

dev_features = bert_utils.convert_examples_to_features(dev_InputExamples,
                                                       MAX_SEQ_LENGTH,
                                                       tokenizer)

all_input_ids = torch.tensor([f.input_ids for f in dev_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in dev_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in dev_features], dtype=torch.long)
all_start_positions = torch.tensor([f.start_label_ids for f in dev_features], dtype=torch.long)
all_end_positions = torch.tensor([f.end_label_ids for f in dev_features],
                    docs.append(doc)
        # need to filter the names from doc and do classification again
        return names, docs

    nft = namefilter()

    def _ext(doc):
        names, docs = predict(doc)
        print('original names', names)
        return nft(list(set().union(*names)), ''.join(docs))

    return _ext


if __name__ == "__main__":
    tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
    device = torch.device(cuda_num if torch.cuda.is_available() else "cpu")
    model = BertForTokenClassification.from_pretrained(
        PRETRAINED_MODEL_NAME, num_labels=len(tag2idx))
    model = model.to(device)

    # additional
    max_grad_norm = 1.0
    FULL_FINETUNING = True
    if FULL_FINETUNING:
        # Fine-tune all layer parameters of the model
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_grouped_parameters = [
def main():
    args = setup_train_args()
    # Log to both a file and the console
    global logger
    logger = create_logger(args)
    # Use the GPU when the user asks for it and one is available
    args.cuda = torch.cuda.is_available() and not args.no_cuda
    device = 'cuda' if args.cuda else 'cpu'
    logger.info('using device:{}'.format(device))
    # Seed the CPU RNG so that results are deterministic
    # Also seed the current GPU; with multiple GPUs, torch.cuda.manual_seed_all()
    # should be used to seed all of them.
    # When we get a good result we usually want it to be reproducible
    if args.seed:
        set_random_seed(args)
    # Select which GPUs to use for training
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    # Initialize the tokenizer
    tokenizer = BertTokenizer(vocab_file=args.vocab_path)
    # Vocabulary size of the tokenizer
    vocab_size = len(tokenizer)
    global pad_id
    pad_id = tokenizer.convert_tokens_to_ids(PAD)
    # Create the output directory for the dialogue model
    if not os.path.exists(args.lyric_model_output_path):
        os.mkdir(args.lyric_model_output_path)
    # Create the output directory for the MMI model
    if not os.path.exists(args.mmi_model_output_path):
        os.mkdir(args.mmi_model_output_path)
    # Load the GPT-2 model
    model, n_ctx = create_model(args, vocab_size)
    model.to(device)
    # Preprocess the raw data: convert the raw corpus into the corresponding token ids
    if args.raw and args.train_mmi:  # if we are training the MMI model
        preprocess_mmi_raw_data(args, tokenizer, n_ctx)
    elif args.raw and not args.train_mmi:  # if we are training the dialogue generation model
        preprocess_raw_data(args, tokenizer, n_ctx)
    # Whether to use multiple GPUs in parallel
    multi_gpu = False
    if args.cuda and torch.cuda.device_count() > 1:
        logger.info("Let's use GPUs to train")
        model = DataParallel(
            model, device_ids=[int(i) for i in args.device.split(',')])
        multi_gpu = True
    # Log the number of model parameters
    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    logger.info('number of model parameters: {}'.format(num_parameters))
    # Load the data
    logger.info("loading training data")
    if args.train_mmi:  # if training the MMI model
        with open(args.train_mmi_tokenized_path, "r", encoding="utf8") as f:
            data = f.read()
    else:  # if training the generation model
        with open(args.train_tokenized_path, "r", encoding="utf8") as f:
            data = f.read()
    data_list = data.split("\n")
    train_list, test_list = train_test_split(data_list,
                                             test_size=0.1,
                                             random_state=1)
    # Start training
    train(model, device, train_list, multi_gpu, args)
    # Evaluate the model
    evaluate(model, device, test_list, multi_gpu, args)
def train_and_test(data_dir, bert_model="bert-base-uncased", task_name=None, output_dir=None, max_seq_length=80, do_train=False, do_eval=False, do_lower_case=False, train_batch_size=24, eval_batch_size=8, learning_rate=2e-5, num_train_epochs=15, warmup_proportion=0.1,no_cuda=False, local_rank=-1, seed=42, gradient_accumulation_steps=1, optimize_on_cpu=False, fp16=False, loss_scale=128, saved_model=""): # ## Required parameters # parser.add_argument("--data_dir", # default=None, # type=str, # required=True, # help="The input data dir. Should contain the .tsv files (or other data files) for the task.") # parser.add_argument("--bert_model", default=None, type=str, required=True, # help="Bert pre-trained model selected in the list: bert-base-uncased, " # "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.") # parser.add_argument("--task_name", # default=None, # type=str, # required=True, # help="The name of the task to train.") # parser.add_argument("--output_dir", # default=None, # type=str, # required=True, # help="The output directory where the model checkpoints will be written.") ## Other parameters # parser.add_argument("--max_seq_length", # default=128, # type=int, # help="The maximum total input sequence length after WordPiece tokenization. \n" # "Sequences longer than this will be truncated, and sequences shorter \n" # "than this will be padded.") # parser.add_argument("--do_train", # default=False, # action='store_true', # help="Whether to run training.") # parser.add_argument("--do_eval", # default=False, # action='store_true', # help="Whether to run eval on the dev set.") # parser.add_argument("--do_lower_case", # default=False, # action='store_true', # help="Set this flag if you are using an uncased model.") # parser.add_argument("--train_batch_size", # default=32, # type=int, # help="Total batch size for training.") # parser.add_argument("--eval_batch_size", # default=8, # type=int, # help="Total batch size for eval.") # parser.add_argument("--learning_rate", # default=5e-5, # type=float, # help="The initial learning rate for Adam.") # parser.add_argument("--num_train_epochs", # default=3.0, # type=float, # help="Total number of training epochs to perform.") # parser.add_argument("--warmup_proportion", # default=0.1, # type=float, # help="Proportion of training to perform linear learning rate warmup for. 
" # "E.g., 0.1 = 10%% of training.") # parser.add_argument("--no_cuda", # default=False, # action='store_true', # help="Whether not to use CUDA when available") # parser.add_argument("--local_rank", # type=int, # default=-1, # help="local_rank for distributed training on gpus") # parser.add_argument('--seed', # type=int, # default=42, # help="random seed for initialization") # parser.add_argument('--gradient_accumulation_steps', # type=int, # default=1, # help="Number of updates steps to accumulate before performing a backward/update pass.") # parser.add_argument('--optimize_on_cpu', # default=False, # action='store_true', # help="Whether to perform optimization and keep the optimizer averages on CPU") # parser.add_argument('--fp16', # default=False, # action='store_true', # help="Whether to use 16-bit float precision instead of 32-bit") # parser.add_argument('--loss_scale', # type=float, default=128, # help='Loss scaling, positive power of 2 values can improve fp16 convergence.') # args = parser.parse_args() processors = { # "cola": ColaProcessor, # "mnli": MnliProcessor, "mrpc": MrpcProcessor, "stance":StanceProcessor, "neg":NegProcessor, "tri": TriProcessor } if local_rank == -1 or no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') if fp16: logger.info("16-bits training currently not supported in distributed training") fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496) logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(local_rank != -1)) if gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( gradient_accumulation_steps)) train_batch_size = int(train_batch_size / gradient_accumulation_steps) random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) if n_gpu > 0: torch.cuda.manual_seed_all(seed) if not do_train and not do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") if do_train: # if os.path.exists(output_dir) and os.listdir(output_dir): if os.path.exists(output_dir): pass # raise ValueError("Output directory ({}) already exists and is not empty.".format(output_dir)) else: os.makedirs(output_dir, exist_ok=True) task_name = task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() # tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=do_lower_case) tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') train_examples = None num_train_steps = None if do_train: train_df = processor.get_train_df(data_dir) test_df = processor.get_test_df(data_dir) dev_df = processor.get_dev_df(data_dir) new_train_df = generate_opp_pers_dataset_not_elim(train_df) new_train_df.to_csv(os.path.join(data_dir, "tri_train.tsv"),sep='\t',index=False) new_test_df = generate_opp_pers_dataset_not_elim(test_df) new_test_df.to_csv(os.path.join(data_dir, "tri_test.tsv"),sep='\t',index=False) new_dev_df = generate_opp_pers_dataset_not_elim(dev_df) new_dev_df.to_csv(os.path.join(data_dir, "tri_dev.tsv"),sep='\t',index=False) train_examples = processor.get_train_examples(data_dir) num_train_steps = int( len(train_examples) / train_batch_size / 
gradient_accumulation_steps * num_train_epochs) # Prepare model # model = BertForSequenceClassification.from_pretrained(bert_model, # cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(local_rank), num_labels = 2) model = BertForConsistencyCueClassification.from_pretrained('bert-base-uncased', num_labels=2) model.to(device) if fp16: model.half() if local_rank != -1: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank], output_device=local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer if fp16: param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) for n, param in model.named_parameters()] elif optimize_on_cpu: param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) for n, param in model.named_parameters()] else: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0} ] t_total = num_train_steps # print(t_total) if local_rank != -1: t_total = t_total // torch.distributed.get_world_size() if do_train: optimizer = BertAdam(optimizer_grouped_parameters, lr=learning_rate, warmup=warmup_proportion, t_total=t_total) global_step = 0 if do_train: claim_features = convert_claims_to_features(train_examples, label_list, max_seq_length, tokenizer) logger.info("claims features done") train_features = convert_pers_to_features(train_examples, label_list, max_seq_length, tokenizer) logger.info("perspective features done") # opposite_claim_features = convert_opp_claims_to_features(train_examples, label_list, max_seq_length, tokenizer) # logger.info("opposite claim features done") opposite_perspective_features = convert_triopp_pers_to_features(train_examples, label_list, max_seq_length, tokenizer) logger.info("opp perspective features done") logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", train_batch_size) logger.info(" Num steps = %d", num_train_steps) pers_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) pers_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) pers_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) pers_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) claims_input_ids = torch.tensor([f.input_ids for f in claim_features], dtype=torch.long) claims_input_mask = torch.tensor([f.input_mask for f in claim_features], dtype=torch.long) claims_segment_ids = torch.tensor([f.segment_ids for f in claim_features], dtype=torch.long) claims_label_ids = torch.tensor([f.label_id for f in claim_features], dtype=torch.long) opp_pers_input_ids = torch.tensor([f.input_ids for f in opposite_perspective_features], dtype=torch.long) opp_pers_input_mask = torch.tensor([f.input_mask for f in opposite_perspective_features], dtype=torch.long) opp_pers_segment_ids = torch.tensor([f.segment_ids for f in opposite_perspective_features], dtype=torch.long) opp_pers_label_ids = torch.tensor([f.label_id for f in opposite_perspective_features], dtype=torch.long) # opp_pers_input_ids = torch.tensor([f.input_ids for f in opposite_perspective_features if f.input_ids], dtype=torch.long) # opp_pers_input_mask = 
torch.tensor([f.input_mask for f in opposite_perspective_features if f.input_mask], dtype=torch.long) # opp_pers_segment_ids = torch.tensor([f.segment_ids for f in opposite_perspective_features if f.segment_ids], dtype=torch.long) # opp_pers_label_ids = torch.tensor([f.label_id for f in opposite_perspective_features if f.label_id], dtype=torch.long) # opp_claims_input_ids = torch.tensor([f.input_ids for f in opposite_claim_features], dtype=torch.long) # opp_claims_input_mask = torch.tensor([f.input_mask for f in opposite_claim_features], dtype=torch.long) # opp_claims_segment_ids = torch.tensor([f.segment_ids for f in opposite_claim_features], dtype=torch.long) # opp_claims_label_ids = torch.tensor([f.label_id for f in opposite_claim_features], dtype=torch.long) # logger.info(" opp pers id: %d, opp pers mask: %d, opp pers seg: %d, opp pers label: %d, opp calims label: %d, calims label: %d ", len(opp_pers_input_ids),len(opp_pers_input_mask),len(opp_pers_segment_ids),len(opp_pers_label_ids),len(opp_claims_label_ids),len(claims_label_ids)) train_data = TensorDataset(pers_input_ids, pers_input_mask, pers_segment_ids, pers_label_ids, claims_input_ids, claims_input_mask, claims_segment_ids, claims_label_ids, opp_pers_input_ids, opp_pers_input_mask, opp_pers_segment_ids, opp_pers_label_ids) if local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size) model.train() for _ in trange(int(num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 process_bar = tqdm(train_dataloader) for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids, claim_input_ids, claim_input_mask, claim_segment_ids, claim_label_ids, opp_input_ids, opp_input_mask, opp_segment_ids, opp_label_ids = batch out_results = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, input_ids2=claim_input_ids, token_type_ids2=claim_segment_ids, attention_mask2=claim_input_mask, labels2=claim_label_ids, input_ids3=opp_input_ids, token_type_ids3=opp_segment_ids, attention_mask3=opp_input_mask, labels3=opp_label_ids) # loss = model(input_ids, segment_ids, input_mask, label_ids) # print("out_results:") # print(out_results) loss = out_results if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if fp16 and loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * loss_scale if gradient_accumulation_steps > 1: loss = loss / gradient_accumulation_steps process_bar.set_description("Loss: %0.8f" % (loss.sum().item())) loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % gradient_accumulation_steps == 0: if fp16 or optimize_on_cpu: if fp16 and loss_scale != 1.0: # scale down gradients for fp16 training for param in model.parameters(): if param.grad is not None: param.grad.data = param.grad.data / loss_scale is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True) if is_nan: logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling") loss_scale = loss_scale / 2 model.zero_grad() continue optimizer.step() copy_optimizer_params_to_model(model.named_parameters(), param_optimizer) else: optimizer.step() model.zero_grad() global_step += 1 print("\nLoss: {}\n".format(tr_loss / nb_tr_steps)) torch.save(model.state_dict(), output_dir +"distance_concat_margin1_costriplet_cos_siamese_bs24_lr2e_5_epoch15.pth") if do_eval and (local_rank == -1 or torch.distributed.get_rank() == 0): train_df = processor.get_train_df(data_dir) test_df = processor.get_test_df(data_dir) dev_df = processor.get_dev_df(data_dir) new_train_df = generate_opp_pers_dataset_not_elim(train_df) new_train_df.to_csv(os.path.join(data_dir, "tri_train.tsv"),sep='\t',index=False) new_test_df = generate_opp_pers_dataset_not_elim(test_df) new_test_df.to_csv(os.path.join(data_dir, "tri_test.tsv"),sep='\t',index=False) new_dev_df = generate_opp_pers_dataset_not_elim(dev_df) new_dev_df.to_csv(os.path.join(data_dir, "tri_dev.tsv"),sep='\t',index=False) # test_df = processor.get_test_df(data_dir) # new_test_df = generate_opp_dataset(test_df) # new_test_df.to_csv(os.path.join(data_dir, "new_test.tsv"),sep='\t',index=False) # train_df = processor.get_train_df(data_dir) # new_train_df = generate_opp_dataset(train_df) # new_train_df.to_csv(os.path.join(data_dir, "new_train.tsv"),sep='\t',index=False) # dev_df = processor.get_dev_df(data_dir) # new_dev_df = generate_opp_dataset(dev_df) # new_dev_df.to_csv(os.path.join(data_dir, "new_dev.tsv"),sep='\t',index=False) eval_examples = processor.get_test_examples(data_dir) # eval_examples = processor.get_train_examples(data_dir) # eval_examples = processor.get_dev_examples(data_dir) claim_features = convert_claims_to_features(eval_examples, label_list, max_seq_length, tokenizer) eval_features = convert_pers_to_features(eval_examples, label_list, max_seq_length, tokenizer) # opposite_claim_features = convert_opp_claims_to_features(eval_examples, label_list, max_seq_length, tokenizer) opposite_eval_features = convert_triopp_pers_to_features(eval_examples, label_list, max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", eval_batch_size) pers_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) pers_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) pers_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) pers_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) claims_input_ids = torch.tensor([f.input_ids for f in claim_features], dtype=torch.long) claims_input_mask = 
torch.tensor([f.input_mask for f in claim_features], dtype=torch.long) claims_segment_ids = torch.tensor([f.segment_ids for f in claim_features], dtype=torch.long) claims_label_ids = torch.tensor([f.label_id for f in claim_features], dtype=torch.long) opp_pers_input_ids = torch.tensor([f.input_ids for f in opposite_eval_features], dtype=torch.long) opp_pers_input_mask = torch.tensor([f.input_mask for f in opposite_eval_features], dtype=torch.long) opp_pers_segment_ids = torch.tensor([f.segment_ids for f in opposite_eval_features], dtype=torch.long) opp_pers_label_ids = torch.tensor([f.label_id for f in opposite_eval_features], dtype=torch.long) # opp_claims_input_ids = torch.tensor([f.input_ids for f in opposite_claim_features], dtype=torch.long) # opp_claims_input_mask = torch.tensor([f.input_mask for f in opposite_claim_features], dtype=torch.long) # opp_claims_segment_ids = torch.tensor([f.segment_ids for f in opposite_claim_features], dtype=torch.long) # opp_claims_label_ids = torch.tensor([f.label_id for f in opposite_claim_features], dtype=torch.long) # logger.info("%d%d%d%d", len(pers_input_ids),len(claims_input_ids),len(opp_pers_input_ids),len(opp_claims_input_ids)) eval_data = TensorDataset(pers_input_ids, pers_input_mask, pers_segment_ids, pers_label_ids, claims_input_ids, claims_input_mask, claims_segment_ids, claims_label_ids, opp_pers_input_ids, opp_pers_input_mask, opp_pers_segment_ids, opp_pers_label_ids) # logger.info(eval_data) # Run prediction for full data # eval_sampler = SequentialSampler(eval_data) eval_sampler = SequentialSampler(eval_data) # logger.info("1") eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size) # print('all_input_ids:') # print(all_input_ids) # logger.info("2") # model.load_state_dict(torch.load(saved_model)) model_state_dict = torch.load(saved_model) # logger.info("3") model = BertForConsistencyCueClassification.from_pretrained('bert-base-uncased', num_labels=2, state_dict=model_state_dict) # logger.info("4") model.to(device) # logger.info("5") model.eval() # logger.info("6") # eval_loss, eval_accuracy = 0, 0 eval_tp, eval_pred_c, eval_gold_c = 0, 0, 0 distance_eval_tp, distance_eval_pred_c, distance_eval_gold_c = 0, 0, 0 eval_loss, eval_accuracy, eval_macro_p, eval_macro_r = 0, 0, 0, 0 distance_accuracy, distance_eval_macro_p, distance_eval_macro_r = 0, 0, 0 raw_score = [] predicted_labels = [] distance_labels = [] predicted_prob = [] gold_labels = [] nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids, claim_input_ids, claim_input_mask, claim_segment_ids, claim_label_ids, opp_input_ids, opp_input_mask, opp_segment_ids, opp_label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) claim_input_ids = claim_input_ids.to(device) claim_input_mask = claim_input_mask.to(device) claim_segment_ids = claim_segment_ids.to(device) claim_label_ids = claim_label_ids.to(device) opp_input_ids = opp_input_ids.to(device) opp_input_mask = opp_input_mask.to(device) opp_segment_ids = opp_segment_ids.to(device) opp_label_ids = opp_label_ids.to(device) # opp_claim_input_ids = opp_claim_input_ids.to(device) # opp_claim_input_mask = opp_claim_input_mask.to(device) # opp_claim_segment_ids = opp_claim_segment_ids.to(device) # opp_claim_label_ids = opp_claim_label_ids.to(device) # print("start") # print(input_ids) # print(input_mask) # print(segment_ids) # print(label_ids) # 
print(claim_input_ids) # print(claim_input_mask) # print(claim_segment_ids) # print(claim_label_ids) # print("end") with torch.no_grad(): tmp_eval_loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, input_ids2=claim_input_ids, token_type_ids2=claim_segment_ids, attention_mask2=claim_input_mask, labels2=claim_label_ids, input_ids3=opp_input_ids, token_type_ids3=opp_segment_ids, attention_mask3=opp_input_mask, labels3=opp_label_ids) logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, input_ids2=claim_input_ids, token_type_ids2=claim_segment_ids, attention_mask2=claim_input_mask, input_ids3=opp_input_ids, token_type_ids3=opp_segment_ids, attention_mask3=opp_input_mask)[0] distance_logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, input_ids2=claim_input_ids, token_type_ids2=claim_segment_ids, attention_mask2=claim_input_mask, input_ids3=opp_input_ids, token_type_ids3=opp_segment_ids, attention_mask3=opp_input_mask)[1] # predicted_prob.extend(torch.nn.functional.softmax(logits, dim=1)) # logits_grid = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, input_ids2=claim_input_ids, token_type_ids2=claim_segment_ids, attention_mask2=claim_input_mask, input_ids3=opp_input_ids, token_type_ids3=opp_segment_ids, attention_mask3=opp_input_mask, input_ids4=opp_claim_input_ids, token_type_ids4=opp_claim_segment_ids, attention_mask4=opp_claim_input_mask) # print(logits) # print(logits[0]) logits = logits.detach().cpu().numpy() distance_logits = distance_logits.detach().cpu().numpy() # print(logits) label_ids = label_ids.to('cpu').numpy() # print(label_ids) tmp_eval_accuracy = accuracy(logits, label_ids) distance_eval_accuracy = accuracy(distance_logits, label_ids) tmp_predicted = np.argmax(logits, axis=1) distance_predicted = np.argmax(distance_logits, axis=1) predicted_labels.extend(tmp_predicted.tolist()) distance_labels.extend(distance_predicted.tolist()) gold_labels.extend(label_ids.tolist()) # Micro F1 (aggregated tp, fp, fn counts across all examples) tmp_tp, tmp_pred_c, tmp_gold_c = tp_pcount_gcount(logits, label_ids) eval_tp += tmp_tp eval_pred_c += tmp_pred_c eval_gold_c += tmp_gold_c distance_tp, distance_pred_c, distance_gold_c = tp_pcount_gcount(distance_logits, label_ids) distance_eval_tp += distance_tp distance_eval_pred_c += distance_pred_c distance_eval_gold_c += distance_gold_c pred_label = np.argmax(logits, axis=1) distance_label = np.argmax(distance_logits, axis=1) raw_score += zip(logits, distance_logits, pred_label, distance_label, label_ids) # Macro F1 (averaged P, R across mini batches) tmp_eval_p, tmp_eval_r, tmp_eval_f1 = p_r_f1(logits, label_ids) eval_macro_p += tmp_eval_p eval_macro_r += tmp_eval_r distance_eval_p, distance_eval_r, distance_eval_f1 = p_r_f1(distance_logits, label_ids) distance_eval_macro_p += distance_eval_p distance_eval_macro_r += distance_eval_r eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy distance_accuracy += distance_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 # Micro F1 (aggregated tp, fp, fn counts across all examples) eval_micro_p = eval_tp / eval_pred_c eval_micro_r = eval_tp / eval_gold_c eval_micro_f1 = 2 * eval_micro_p * eval_micro_r / (eval_micro_p + eval_micro_r) distance_eval_micro_p = distance_eval_tp / distance_eval_pred_c distance_eval_micro_r = distance_eval_tp / distance_eval_gold_c distance_eval_micro_f1 = 2 * 
distance_eval_micro_p * distance_eval_micro_r / (distance_eval_micro_p + distance_eval_micro_r) # Macro F1 (averaged P, R across mini batches) eval_macro_p = eval_macro_p / nb_eval_steps eval_macro_r = eval_macro_r / nb_eval_steps eval_macro_f1 = 2 * eval_macro_p * eval_macro_r / (eval_macro_p + eval_macro_r) distance_eval_macro_p = distance_eval_macro_p / nb_eval_steps distance_eval_macro_r = distance_eval_macro_r / nb_eval_steps distance_eval_macro_f1 = 2 * distance_eval_macro_p * distance_eval_macro_r / (distance_eval_macro_p + distance_eval_macro_r) eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples distance_accuracy = distance_accuracy / nb_eval_examples # print("\nLoss: {}\n".format(eval_loss / nb_eval_steps)) result = { 'eval_loss': eval_loss, 'eval_accuracy':eval_accuracy, 'eval_micro_p': eval_micro_p, 'eval_micro_r': eval_micro_r, 'eval_micro_f1': eval_micro_f1, 'eval_macro_p': eval_macro_p, 'eval_macro_r': eval_macro_r, 'eval_macro_f1': eval_macro_f1, 'distance_accuracy':distance_accuracy, 'distance_eval_micro_p': distance_eval_micro_p, 'distance_eval_micro_r': distance_eval_micro_r, 'distance_eval_micro_f1': distance_eval_micro_f1, 'distance_eval_macro_p': distance_eval_macro_p, 'distance_eval_macro_r': distance_eval_macro_r, 'distance_eval_macro_f1': distance_eval_macro_f1 # 'global_step': global_step, # 'loss': tr_loss/nb_tr_steps } output_eval_file = os.path.join(output_dir, "elim_opp_distance_concat_margin1_costriplet_cos_siamese_bs24_lr2e_5_epoch15_eval_results.txt") output_raw_score = os.path.join(output_dir, "elim_opp_distance_concat_margin1_costriplet_cos_siamese_bs24_lr2e_5_epoch15_raw_score.csv") # logger.info(classification_report(gold_labels, predicted_labels, target_names=label_list, digits=4)) with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) # writer.write(classification_report(gold_labels, predicted_labels, target_names=label_list, digits=4)) with open(output_raw_score, 'w') as fout: fields = ["undermine_score", "support_score", "cp_distance", "cop_distance", "predict_label", "distance_label", "gold"] writer = csv.DictWriter(fout, fieldnames=fields) writer.writeheader() for score, distance, pred, distance_pred, gold in raw_score: writer.writerow({ "undermine_score": str(score[0]), "support_score": str(score[1]), "cp_distance": str(distance[0]), "cop_distance": str(distance[1]), "predict_label": str(pred), "distance_label": str(distance_pred), "gold": str(gold) })
    values  # stratified split: class proportions in the produced samples match those of the values passed here
)
df['data_type'] = ['not_set'] * df.shape[0]
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'
df.groupby(['category', 'label', 'data_type']).count()  # check how the train/val split is distributed per label

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

# The tokenizer turns raw text into tokens; each token is mapped to an integer id.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Special tokens mark where a sentence ends.
# The attention mask identifies padding positions: since sentences vary in length,
# they are padded so that every example has the same dimensionality.
encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type == 'train'].text.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt')

encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type == 'val'].text.values,
    add_special_tokens=True,
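# Illustrative continuation (not from the original notebook): the encoded split is
# typically wrapped into a TensorDataset; `torch` is assumed to be imported and
# `label` to be the integer label column of df.
dataset_train = TensorDataset(encoded_data_train['input_ids'],
                              encoded_data_train['attention_mask'],
                              torch.tensor(df[df.data_type == 'train'].label.values))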
def main():
    random.seed(123)
    np.random.seed(123)
    torch.manual_seed(123)

    parser = argparse.ArgumentParser()
    parser.add_argument('--cuda', default=None, type=int, required=True,
                        help='Selected CUDA.')
    parser.add_argument('--batch_size', default=None, type=int, required=True,
                        help='Batch size.')
    args = parser.parse_args()

    models = [
        ('pfx', 1), ('pfx', 2), ('pfx', 4), ('pfx', 8),
        ('pfx', 16), ('pfx', 32), ('pfx', 64)
    ]

    for m in models:
        print('Mode: {}'.format(m[0]))
        print('Count: {}'.format(m[1]))
        print('Batch size: {}'.format(args.batch_size))

        # Define path to data
        inpath = str(Path('../../data/final').resolve())
        test_path = '{}{}sents_{:02d}_test.txt'.format(inpath, os.sep, m[1])

        # Initialize val loader
        print('Load validation data...')
        try:
            test_data = AffixDataset(test_path, m[0])
        except FileNotFoundError:
            print('Bin not found.')
            continue
        test_loader = DataLoader(test_data, batch_size=args.batch_size,
                                 collate_fn=collate_sents)

        tok = BertTokenizer.from_pretrained('bert-base-uncased')

        # Define device
        device = torch.device('cuda:{}'.format(args.cuda) if torch.cuda.is_available() else 'cpu')

        # Initialize model
        affix_predictor = AffixPredictor(m[0], freeze=False)

        # Move model to CUDA
        affix_predictor = affix_predictor.to(device)

        mrr_micro, mrr_macro_dict = test_single(test_loader, affix_predictor, m[0], args.cuda)

        with open('results_final/results_hyp_macro.txt', 'a+') as f:
            f.write('{:.3f} & '.format(np.mean(list(mrr_macro_dict.values()))))
        with open('results_final/results_hyp_micro.txt', 'a+') as f:
            f.write('{:.3f} & '.format(mrr_micro))
def __init__(self):
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
    self.model = TFBertModel.from_pretrained('bert-base-multilingual-cased')
    self.build_model()
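# Illustrative sketch (the original build_model body is not shown): one common way
# to put a Keras classification head on top of the TFBertModel loaded above.
# The sequence length (128) and the 2-class output layer are assumptions.
import tensorflow as tf

def build_model(self):
    input_ids = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name='input_ids')
    attention_mask = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name='attention_mask')
    sequence_output = self.model(input_ids, attention_mask=attention_mask)[0]
    cls_token = sequence_output[:, 0, :]  # representation of the [CLS] token
    logits = tf.keras.layers.Dense(2, activation='softmax')(cls_token)
    self.keras_model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=logits)
    self.keras_model.compile(optimizer=tf.keras.optimizers.Adam(2e-5),
                             loss='sparse_categorical_crossentropy',
                             metrics=['accuracy'])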
# LEARNING_RATE_MODEL = 1e-5
# LEARNING_RATE_CLASSIFIER = 1e-3
# MAX_GRAD_NORM = 1.0
EARLY_STOPPING_ROUNDS = 2
NUM_MODELS = 3
MODEL_PATH = "models/bert_{}".format(time.strftime('%Y%m%d%H%M'))
os.mkdir(MODEL_PATH)

train = pd.read_csv('data/train_preprocessed.csv')
test = pd.read_csv('data/test_preprocessed.csv')
train['comment_text'].fillna("", inplace=True)
test['comment_text'].fillna("", inplace=True)

classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
X_train_valid_raw, y_train_valid = train['comment_text'].str.lower(), train[classes].values
X_test_raw = test['comment_text'].str.lower()

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
X_train_valid = np.array(list(map(
    lambda x: tokenizer.encode(x, max_length=MAX_LEN, pad_to_max_length=True),
    X_train_valid_raw)))
X_test = np.array(list(map(
    lambda x: tokenizer.encode(x, max_length=MAX_LEN, pad_to_max_length=True),
    X_test_raw)))


class BertForSequenceClassification(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
        self.init_weights()

    def forward(
def main():
    # 'bert-base-chinese' is the Hugging Face model id ('bert-chinese' is not a published checkpoint)
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    train_sent_pair, train_label = read_data('./data/train/train.csv')
    train_data = {'sent': [], 'sent_mask': [], 'sent_segment': []}
    # assumes every encoded pair fits within MAX_Q_LENGTH tokens
    for q1, q2 in train_sent_pair:
        sent = '[CLS]' + q1 + '[SEP]' + q2 + '[SEP]'
        token_list = tokenizer.tokenize(sent)
        for i, word in enumerate(token_list):
            if word == '[SEP]':
                q1_len = i + 1
                break
        sent_id = tokenizer.convert_tokens_to_ids(token_list)
        padding_id = [0] * (MAX_Q_LENGTH - len(token_list))
        train_data['sent'].append(sent_id + padding_id)
        train_data['sent_segment'].append([1] * q1_len + [0] * (MAX_Q_LENGTH - q1_len))
        train_data['sent_mask'].append([1] * len(token_list) + padding_id)

    t_seqs = torch.tensor(train_data['sent'], dtype=torch.long)
    t_seq_segs = torch.tensor(train_data['sent_segment'], dtype=torch.long)
    t_seq_masks = torch.tensor(train_data['sent_mask'], dtype=torch.long)
    t_labels = torch.tensor(train_label, dtype=torch.long)
    dataset = TensorDataset(t_seqs, t_seq_masks, t_seq_segs, t_labels)
    dataloader = DataLoader(dataset, shuffle=True, batch_size=32)

    device = "cpu"  # 'cuda:0'
    model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=2)
    model.to(device)
    model.train()

    param_optimizer = list(model.named_parameters())
    # print(param_optimizer)
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01
        },
        {
            'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=2e-05)

    for i in range(10):
        for step, batch_data in enumerate(dataloader):
            batch_data = tuple(t.to(device) for t in batch_data)
            batch_seqs, batch_seq_masks, batch_seq_segments, batch_labels = batch_data
            logits = model(batch_seqs, batch_seq_masks, batch_seq_segments, labels=None)
            loss_function = CrossEntropyLoss()
            loss = loss_function(logits[0], batch_labels)
            optimizer.zero_grad()
            loss.backward()
            print("epoch {}, step {}, loss = {}".format(i, step, loss.item()))
            optimizer.step()

    dev_sent_pair, dev_label = read_data('./dev.csv')
    dev_data = {'sent': [], 'sent_mask': [], 'sent_segment': []}
    for q1, q2 in dev_sent_pair:
        sent = '[CLS]' + q1 + '[SEP]' + q2 + '[SEP]'
        token_list = tokenizer.tokenize(sent)
        for i, word in enumerate(token_list):
            if word == '[SEP]':
                q1_len = i + 1
                break
        sent_id = tokenizer.convert_tokens_to_ids(token_list)
        # print(len(token_list) == len(sent_id))
        padding_id = [0] * (MAX_Q_LENGTH - len(token_list))
        dev_data['sent'].append(sent_id + padding_id)
        dev_data['sent_segment'].append([1] * q1_len + [0] * (MAX_Q_LENGTH - q1_len))
        dev_data['sent_mask'].append([1] * len(token_list) + padding_id)

    t_seqs = torch.tensor(dev_data['sent'], dtype=torch.long)
    t_seq_segs = torch.tensor(dev_data['sent_segment'], dtype=torch.long)
    t_seq_masks = torch.tensor(dev_data['sent_mask'], dtype=torch.long)
    t_labels = torch.tensor(dev_label, dtype=torch.long)
    dataset = TensorDataset(t_seqs, t_seq_masks, t_seq_segs, t_labels)
    dataloader = DataLoader(dataset, shuffle=True, batch_size=32)

    true_labels = []
    pred_labels = []
    model.eval()
    with torch.no_grad():
        for batch_data in dataloader:
            batch_data = tuple(t.to(device) for t in batch_data)
            batch_seqs, batch_seq_masks, batch_seq_segments, batch_labels = batch_data
            logits = model(batch_seqs, batch_seq_masks, batch_seq_segments, labels=None)
            logits = logits[0].argmax(dim=1)
            pred_labels += logits.cpu().numpy().tolist()
            true_labels += batch_labels.cpu().numpy().tolist()

    acc_cnt = 0
    for l_pre, l_true in zip(pred_labels, true_labels):
        if l_pre == l_true:
            acc_cnt += 1
    print('valid acc: {}'.format(acc_cnt / len(pred_labels)))
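# The manual '[CLS]'/'[SEP]' insertion, segment ids, and padding above can also be
# produced by the tokenizer itself. A minimal alternative sketch (not the author's
# code), assuming MAX_Q_LENGTH is the desired padded length and a transformers
# version that supports padding/truncation arguments in encode_plus:
def encode_pair(tokenizer, q1, q2, max_length):
    # encode_plus builds input ids, token_type_ids (segments) and the attention
    # mask for a sentence pair in one call, truncating/padding to max_length.
    enc = tokenizer.encode_plus(q1, q2,
                                max_length=max_length,
                                padding='max_length',
                                truncation=True)
    return enc['input_ids'], enc['token_type_ids'], enc['attention_mask']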
for i in range(0, len(test_ids_labels)):
    tem_list = test_ids_labels[i].strip().split('\t')
    tem_id = tem_list[0].split('/')[-1].replace('.jpg', '')
    test_ids.append(tem_id)
    test_labels.append(int(tem_list[1]))
    with open('/data/scratch/projects/punim0478/ailis/Fakeddit/fakenews_full/text/' + tem_id + '.txt') as f:
        data = f.readline().strip()
        test_text.append(data)

logger.info('train: ' + str(len(train_ids)) + ' dev: ' + str(len(dev_ids)) + ' test: ' + str(len(test_ids)))

tokenizer = BertTokenizer.from_pretrained(parser.bert_pretrained)
train_dataset = TextDataset(train_ids, train_text, train_labels, tokenizer)
dev_dataset = TextDataset(dev_ids, dev_text, dev_labels, tokenizer)
test_dataset = TextDataset(test_ids, test_text, test_labels, tokenizer)

dataloaders_train = torch.utils.data.DataLoader(train_dataset,
                                                batch_size=parser.batch_size,
                                                shuffle=True,
                                                num_workers=4)
dataloaders_dev = torch.utils.data.DataLoader(dev_dataset,
                                              batch_size=parser.batch_size,
                                              shuffle=False,
                                              num_workers=4)
dataloaders_test = torch.utils.data.DataLoader(test_dataset,
                                               batch_size=parser.batch_size,
                                               shuffle=False,
                                               num_workers=4)
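# TextDataset is referenced above but not defined in this excerpt. A minimal sketch
# of what such a dataset might look like (an assumption for illustration, not the
# project's actual class): tokenize each text and return tensors plus the label.
import torch
from torch.utils.data import Dataset


class TextDataset(Dataset):
    def __init__(self, ids, texts, labels, tokenizer, max_length=128):
        self.ids = ids
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(self.texts[idx],
                             max_length=self.max_length,
                             padding='max_length',
                             truncation=True,
                             return_tensors='pt')
        return (enc['input_ids'].squeeze(0),
                enc['attention_mask'].squeeze(0),
                torch.tensor(self.labels[idx], dtype=torch.long))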
def main():
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('--mode', choices=['train', 'validate', 'predict'], default='train')
    arg('--run_root', default='.')
    arg('--batch-size', type=int, default=16)
    arg('--step', type=int, default=1)
    arg('--workers', type=int, default=0)
    arg('--lr', type=float, default=0.00003)
    arg('--adam_epsilon', type=float, default=1e-8)
    arg('--weight_decay', type=float, default=0.0)
    arg('--fold', type=int, default=0)
    arg('--warmup', type=float, default=0.05)
    arg('--limit', type=int)
    arg('--patience', type=int, default=1)
    arg('--clean', action='store_true')
    arg('--n-epochs', type=int, default=20)
    arg('--vocab-size', type=int, default=13318)
    arg('--multi-gpu', type=int, default=0)
    arg('--print-num', type=int, default=5)
    arg('--temperature', type=float)
    args = parser.parse_args()

    df = pd.read_table('../data/dialog-rewrite/corpus.txt', sep="\t\t",
                       names=['a', 'b', 'current', 'label'], dtype=str)
    df.dropna(how='any', inplace=True)
    train_length = int(len(df) * 0.9)
    train_df = df.iloc[:train_length].iloc[:, :]
    valid_df = df.iloc[train_length:]
    print(valid_df.head())

    if args.mode == 'predict':
        # valid_df['current'] = valid_df['label']
        valid_df = pd.read_table('../data/dialog-rewrite/test.csv', sep=",",
                                 names=['a', 'b', 'current', 'label'], dtype=str)
        print(valid_df.tail())

    valid_df['eval_label'] = valid_df['label'].apply(lambda x: ' '.join(list(x)))

    if args.limit:
        train_df = train_df.iloc[0:args.limit]
        valid_df = valid_df.iloc[0:args.limit]

    # train_df['len'] = train_df['content'].apply(lambda x: len(x))
    run_root = Path('../experiments/' + args.run_root)
    tokenizer = BertTokenizer.from_pretrained("../rbt3")
    valid_set = TaggerRewriterDataset(valid_df, tokenizer, valid=True)
    valid_index = np.array(valid_set.valid_index)
    # np.save('index.npy', valid_index)
    valid_df = valid_df.reset_index().loc[valid_index, :]
    ner_index = np.array(valid_set.label_type) == 1
    valid_loader = DataLoader(valid_set, batch_size=args.batch_size, shuffle=False,
                              num_workers=args.workers, collate_fn=tagger_collate_fn)

    config = BertConfig.from_json_file('../rbt3/config.json')
    config.num_labels = 5
    # config.is_decoder = True
    # decoder = BertModel.from_pretrained("../rbt3", config=config)
    # encoder = BertModel.from_pretrained("../rbt3")
    # args.vocab_size = config.vocab_size
    bert_path = '../rbt3'
    model = TaggerRewriteModel(config, bert_path)
    model.cuda()

    if args.mode == 'train':
        if run_root.exists() and args.clean:
            shutil.rmtree(run_root)
        run_root.mkdir(exist_ok=True, parents=True)
        (run_root / 'params.json').write_text(
            json.dumps(vars(args), indent=4, sort_keys=True))

        train_set = TaggerRewriterDataset(train_df, tokenizer)
        # np.save('index.npy', train_set.valid_index)
        train_loader = DataLoader(train_set, batch_size=args.batch_size, shuffle=True,
                                  num_workers=args.workers, collate_fn=tagger_collate_fn)

        # Prepare optimizer and schedule (linear warmup and decay)
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters()
                           if not any(nd in n for nd in no_decay)],
                "weight_decay": args.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters()
                           if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr, eps=args.adam_epsilon)
        t_total = int(len(train_df) * args.n_epochs / args.batch_size)
        warmup_steps = int(t_total * args.warmup)
        # scheduler = get_linear_schedule_with_warmup(
        #     optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total
        # )
        scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps)
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2', verbosity=0)
        train(args, model, optimizer, scheduler, tokenizer, ner_index,
              train_loader=train_loader, valid_df=valid_df, valid_loader=valid_loader,
              epoch_length=len(train_df))
    elif args.mode == 'validate':
        model_path = run_root / ('tagger_model-%d.pt' % args.fold)
        load_model(model, model_path)
        valid_metrics = validate(model, valid_loader, valid_df, args, tokenizer,
                                 ner_index, decode_mode='beam_search')
    elif args.mode == 'predict':
        model_path = run_root / ('tagger_model-%d.pt' % args.fold)
        load_model(model, model_path)
        valid_metrics = validate(model, valid_loader, valid_df, args, tokenizer,
                                 decode_mode='beam_search')
def __init__(self, model_path, use_gpu=True):
    self.model = BertForTokenClassification.from_pretrained(model_path)
    self.tokenizer = BertTokenizer.from_pretrained(model_path)
    self.labels_map = self.model.config.id2label
    # respect the caller's flag instead of hard-coding True
    self.use_gpu = use_gpu
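# A hypothetical companion method (not part of the original snippet) showing how the
# fields set above would typically be used for inference: tokenize a sentence, run
# the token-classification head, and map argmax ids back through labels_map. It
# assumes a recent transformers version where model outputs expose .logits.
def predict(self, text):
    device = 'cuda' if (self.use_gpu and torch.cuda.is_available()) else 'cpu'
    self.model.to(device)
    self.model.eval()
    encoding = self.tokenizer(text, return_tensors='pt', truncation=True)
    with torch.no_grad():
        logits = self.model(**{k: v.to(device) for k, v in encoding.items()}).logits
    label_ids = logits.argmax(dim=-1)[0].tolist()
    tokens = self.tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])
    return [(tok, self.labels_map[i]) for tok, i in zip(tokens, label_ids)]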
def main():
    # global variables to be used in script
    PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
    class_names = ['negative', 'positive']
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    # define the device once here so both load_model and BERT_inference can see it
    device = torch.device('cpu')

    class Model(nn.Module):
        def __init__(self, *args, **kwargs):
            super(Model, self).__init__()

    # develop a class for the Sentiment Classifier
    class SentimentClassifier(nn.Module):
        def __init__(self, n_classes):
            super(SentimentClassifier, self).__init__()
            self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
            self.drop = nn.Dropout(p=0.3)
            self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

        def forward(self, input_ids, attention_mask):
            _, pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
            output = self.drop(pooled_output)
            return self.out(output)

    # Generate a title for our webpage
    st.title('Sentiment analysis and product reviews.')
    # creating a sidebar for our webpage
    st.sidebar.title("Sentiment Analysis Web App")
    # little comment for our sidebar section
    st.sidebar.markdown("😃Is your review positive or negative?😞")

    # Here we will load the data into a cache to prevent repeated work
    @st.cache
    def load_data():
        # Function to pull in data from our Amazon s3 Bucket
        data = pd.read_csv('https://amazonproductdata.s3-us-west-1.amazonaws.com/train.csv')
        return data

    # let's ingest our raw data here
    df = load_data()

    @st.cache
    def get_model():
        gdown.download("https://drive.google.com/uc?id=1cz41bp4tf37Mky_R31T41qiSN6ucMjGi",
                       "./assets/model_state_dict.bin", quiet=False)

    get_model()

    # A function for loading models in case we include other models later
    def load_model(filepath):
        model = SentimentClassifier(len(class_names))
        model.load_state_dict(torch.load(filepath, map_location=device))
        return model

    # loading model into memory - works locally
    # model = load_model('./model/BERT_trained_model')  # This one works locally!
    model = load_model('./assets/model_state_dict.bin')

    # here we have the ability to plot data metrics
    # note: expects x_test / y_test to be defined elsewhere before use
    def plot_metrics(metrics_list):
        if "Confusion Matrix" in metrics_list:
            st.subheader("Confusion Matrix")
            plot_confusion_matrix(model, x_test, y_test, display_labels=class_names)

    # function to provide inference from BERT model
    def BERT_inference(review_text):
        # tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        # Now we must encode the user text
        encoded_review = tokenizer.encode_plus(
            review_text,
            max_length=300,
            add_special_tokens=True,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids = encoded_review['input_ids'].to(device)
        attention_mask = encoded_review['attention_mask'].to(device)
        output = model(input_ids, attention_mask)
        _, prediction = torch.max(output, dim=1)
        st.write(f'Review text: {review_text}')
        st.write(f'Sentiment : {class_names[prediction]}')

    # sidebar options to add more rich features to our app
    if st.sidebar.checkbox("Show raw data", False):
        st.subheader("Amazon Review Sentiment Analysis. (Polarity Classification)")
        st.table(df.head(10))

    # Generating a textbox for user input
    if st.sidebar.checkbox("Input text for inference", False):
        st.subheader("Amazon Review Dataset for Sentiment Analysis. (Inference Demonstration.)")
        user_input = st.text_area("Please provide a review here.")
        if user_input:
            # Let's process the user's input
            print(user_input)
            BERT_inference(user_input)
print('model_name: ', model_name)
if ('RoBerta' in model_name) or ('roberta' in model_name):
    from transformers import RobertaTokenizer, RobertaModel
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=False)
    from multi_label_fns import RoBerta_clf
    model = RoBerta_clf.from_pretrained(model_name,
                                        num_labels=NUM_LABELS,
                                        output_attentions=False,
                                        output_hidden_states=True)
    print('using RoBerta:', model_name)
elif ('Bert' in model_name) or ('bert' in model_name):
    from transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
    from multi_label_fns import Bert_clf
    model = Bert_clf.from_pretrained(model_name,
                                     num_labels=NUM_LABELS,
                                     output_attentions=False,
                                     output_hidden_states=True)
    print('using Bert:', model_name)
elif ('XLM' in model_name) or ('xlm' in model_name):
    from transformers import XLMTokenizer
    tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-enfr-1024', do_lower_case=False)
    from multi_label_fns import XLM_clf
    model = XLM_clf.from_pretrained(model_name,
                                    num_labels=NUM_LABELS,
                                    output_attentions=False,
                                    output_hidden_states=True)
    print('using XLM:', model_name)
def __init__(self):
    super(NLI, self).__init__()
    self.model = BertForSequenceClassification.from_pretrained(
        './data/model_en/bert_fine_tuning').cuda()
    self.tokenizer = BertTokenizer.from_pretrained('./data/model_en/bert_fine_tuning')
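# A hypothetical usage sketch (not in the original snippet): feed a premise/hypothesis
# pair through the fine-tuned model loaded above and take the argmax class. The label
# order depends on how the checkpoint was trained, so raw indices are returned; it
# assumes a recent transformers version where outputs expose .logits, and nli_predict
# is an illustrative helper name.
import torch


def nli_predict(nli, premise, hypothesis, max_length=128):
    enc = nli.tokenizer(premise, hypothesis,
                        return_tensors='pt',
                        truncation=True,
                        max_length=max_length)
    enc = {k: v.cuda() for k, v in enc.items()}
    with torch.no_grad():
        logits = nli.model(**enc).logits
    return int(logits.argmax(dim=-1).item())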