def get_tokenizer(self):
    """Return a pretrained tokenizer matching ``self.hparams.model_type``.

    Returns:
        A tokenizer instance loaded via ``from_pretrained``.

    Raises:
        ValueError: if ``model_type`` is not one of the supported keys.
    """
    # Map model-type keys to (tokenizer class, pretrained checkpoint name);
    # replaces the original ten-branch if/elif chain.
    tokenizers = {
        'bert': (BertTokenizer, 'bert-base-uncased'),
        'bert-cased': (BertTokenizer, 'bert-base-cased'),
        'bert-large': (BertTokenizer, 'bert-large-uncased'),
        'distilbert': (DistilBertTokenizer, 'distilbert-base-uncased'),
        'roberta': (RobertaTokenizer, 'roberta-base'),
        'roberta-large': (RobertaTokenizer, 'roberta-large'),
        'albert': (AlbertTokenizer, 'albert-base-v2'),
        'albert-xxlarge': (AlbertTokenizer, 'albert-xxlarge-v2'),
        'electra': (ElectraTokenizer, 'google/electra-base-discriminator'),
        'electra-large': (ElectraTokenizer,
                          'google/electra-large-discriminator'),
    }
    try:
        tokenizer_cls, checkpoint = tokenizers[self.hparams.model_type]
    except KeyError:
        # Fix: the original raised a bare `ValueError` with no message.
        raise ValueError(
            f"Unsupported model_type: {self.hparams.model_type!r}") from None
    return tokenizer_cls.from_pretrained(checkpoint)
def _init_deep_model(self, model_type, model_path, num_labels, num_regs=None):
    """Load a sequence-classification model with its config and tokenizer,
    put it in eval mode on ``self.device``, and return
    ``(config, tokenizer, model)``.

    ``num_regs`` is only consumed by the electra multi-task variant.
    """
    if 'roberta' in model_type:
        tok = RobertaTokenizer.from_pretrained(model_path)
        cfg = RobertaConfig.from_pretrained(model_path)
        cfg.num_labels = num_labels
        net = RobertaForSequenceClassification.from_pretrained(
            model_path, config=cfg)
    elif 'electra_multitask' in model_type:
        # Must be tested before the plain 'electra' substring below,
        # which would also match this model type.
        tok = ElectraTokenizer.from_pretrained(model_path)
        tok.add_special_tokens({'additional_special_tokens': ['[VALUES]']})
        cfg = ElectraConfig.from_pretrained(model_path)
        cfg.num_labels = num_labels
        cfg.num_regs = num_regs
        cfg.vocab_size = len(tok)
        net = ElectraForSequenceClassificationMultiTask.from_pretrained(
            model_path, config=cfg)
    elif 'electra' in model_type:
        tok = ElectraTokenizer.from_pretrained(model_path)
        cfg = ElectraConfig.from_pretrained(model_path)
        cfg.num_labels = num_labels
        net = ElectraForSequenceClassification.from_pretrained(
            model_path, config=cfg)
    else:
        raise NotImplementedError()
    # Shared tail hoisted out of every branch: inference mode on the
    # configured device.
    net.eval()
    net.to(self.device)
    return cfg, tok, net
def load_model_and_tokenizer(lm, model_dir=None):
    """Instantiate the Matres model and tokenizer for language model ``lm``.

    ``model_dir``, when given, overrides the default pretrained checkpoint.

    Raises:
        RuntimeError: if ``lm`` is not a supported key.
    """
    # (model class, tokenizer class, default checkpoint) per supported lm.
    registry = {
        'roberta': (RobertaForMatres, RobertaTokenizer, 'roberta-base'),
        'bert': (BertForMatres, BertTokenizer, 'bert-base-uncased'),
        'bert-large': (BertForMatres, BertTokenizer, 'bert-large-uncased'),
        'electra': (ElectraForMatres, ElectraTokenizer,
                    'google/electra-base-discriminator'),
        'electra-large': (ElectraForMatres, ElectraTokenizer,
                          'google/electra-large-discriminator'),
    }
    if lm not in registry:
        raise RuntimeError(
            "Please specify valid model from {bert, bert-large, roberta, electra, electra-large}."
        )
    model_cls, tokenizer_cls, default_path = registry[lm]
    model_path = model_dir if model_dir else default_path
    model = model_cls.from_pretrained(model_path)
    tokenizer = tokenizer_cls.from_pretrained(model_path)
    return model, tokenizer
def main(args):
    """Run KoELECTRA inference over the fixed test tsv and save predictions.

    (Translated from the original Korean docstring: inference works on any
    dataset that follows the same tsv format.)
    """
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Load tokenizer.
    TOK_NAME = "monologg/koelectra-base-v3-discriminator"
    tokenizer = ElectraTokenizer.from_pretrained(TOK_NAME)

    # Load the fine-tuned model from args.model_dir.
    model = ElectraForSequenceClassification.from_pretrained(args.model_dir)
    # Fix: removed the original no-op `model.parameters` attribute access
    # (it was never called) and the unused MODEL_NAME local.
    model.to(device)

    # Load the test dataset.
    test_dataset_dir = "/opt/ml/input/data/test/test.tsv"
    test_dataset, test_label = load_test_dataset(test_dataset_dir, tokenizer)
    test_dataset = RE_Dataset(test_dataset, test_label)

    # Predict answers.
    logits, predictions = inference(model, test_dataset, device)

    # Write predictions; the directory and column layout below must be kept
    # (translated from the original Korean note).
    output = pd.DataFrame(predictions, columns=['pred'])
    output.to_csv('./prediction/koelectra-submission6.csv', index=False)
def test_mutual_dataset(args):
    """Sanity check: the train split holds 7088 dialogues x 4 candidates."""
    tok = ElectraTokenizer.from_pretrained(
        "google/electra-small-discriminator",
        do_lower_case=True,
        use_fast=True,
    )
    dataset = MutualDataset(args.data_dir, "train", tok)
    expected = 7088 * 4
    assert len(dataset) == expected
def __init__(self):
    """Load the KoELECTRA wellness classifier from the saved checkpoint."""
    self.root_path = '..'
    self.checkpoint_path = f"{self.root_path}/checkpoint"
    self.save_ckpt_path = (
        f"{self.checkpoint_path}/koelectra-wellnesee-text-classification.pth")
    pretrained_name = "monologg/koelectra-base-discriminator"

    # Answers and categories (translated from the original Korean comment).
    self.category, self.answer = load_wellness_answer()

    device_type = "cuda" if torch.cuda.is_available() else "cpu"
    self.device = torch.device(device_type)

    # Restore the saved checkpoint (translated comment).
    state = torch.load(self.save_ckpt_path, map_location=self.device)

    # Electra tokenizer plus a 359-label classification head.
    self.tokenizer = ElectraTokenizer.from_pretrained(pretrained_name)
    config = ElectraConfig.from_pretrained(pretrained_name)
    self.model = koElectraForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path=pretrained_name,
        config=config,
        num_labels=359)
    self.model.load_state_dict(state['model_state_dict'])
    self.model.to(self.device)
    self.model.eval()
def pred_sm(args):
    """Run inference over every .txt transcript under tmp_ASR/.

    Writes "<name>:<pred>" lines to ``args.out_infer_file`` and returns the
    parallel lists ``(file names, predictions as strings)``.
    """
    # NOTE(review): relies on a module-level `device` — confirm.
    model = load_model(device)
    tokenizer = ElectraTokenizer.from_pretrained(
        "monologg/koelectra-base-v3-discriminator", do_lower_case=False)

    # Start from a fresh output file, creating its directory if needed.
    if os.path.exists(args.out_infer_file):
        os.remove(args.out_infer_file)
    out_infer_folder = os.path.dirname(args.out_infer_file)
    if not os.path.exists(out_infer_folder):
        os.makedirs(out_infer_folder)

    asr_dir = os.path.dirname(os.path.realpath(__file__)) + '/tmp_ASR'
    resultData = []
    fileList = []
    files = os.listdir(asr_dir)
    files.sort()
    # Fix: the output file handle was never closed; use a context manager.
    with open(args.out_infer_file, "a") as wf:
        for filename in tqdm(files):
            if os.path.splitext(filename)[1].lower() != '.txt':
                continue
            # Fix: build the path with '/' directly instead of the original
            # "{}\{}" template (ambiguous backslash escape) + replace().
            args.infer_file = "{}/{}".format(asr_dir,
                                             filename).replace('\\', '/')
            test_dataset = load_and_cache_examples(args, tokenizer,
                                                   mode="infer")
            preds = evaluate(args, model, test_dataset)
            name = filename.split('.txt')[0]
            print(name + ':' + str(preds[0]), file=wf)
            fileList.append(name)
            resultData.append(str(preds[0]))
    return fileList, resultData
def main(args):
    """Transcribe wavs via Google STT, then run inference over tmp_ASR/*.txt.

    Appends "<name>:<pred>" lines to ``args.out_infer_file``.
    """
    GoogleSTT(args.in_wav_folder)
    # NOTE(review): relies on a module-level `device` — confirm.
    model = load_model(device)
    tokenizer = ElectraTokenizer.from_pretrained(
        "monologg/koelectra-base-v3-discriminator", do_lower_case=False)

    # Recreate the output file from scratch, creating its directory if needed.
    if os.path.exists(args.out_infer_file):
        os.remove(args.out_infer_file)
    out_infer_folder = os.path.dirname(args.out_infer_file)
    if not os.path.exists(out_infer_folder):
        os.makedirs(out_infer_folder)

    files = os.listdir(os.path.dirname(os.path.realpath(__file__)) +
                       '/tmp_ASR')
    # Fix: the output file handle was never closed; use a context manager.
    with open(args.out_infer_file, "a") as wf:
        for filename in tqdm(files):
            if os.path.splitext(filename)[1].lower() != '.txt':
                continue
            # Fix: build the relative path with '/' directly instead of the
            # original "{}\{}" template (ambiguous backslash escape).
            args.infer_file = "{}/{}".format('tmp_ASR',
                                             filename).replace('\\', '/')
            test_dataset = load_and_cache_examples(args, tokenizer,
                                                   mode="infer")
            preds = evaluate(args, model, test_dataset)
            name = filename.split('.txt')[0]
            print(name + ':' + str(preds[0]), file=wf)
def __init__(self, root_path='../ai/chatbot'):
    """Load the wellness text classifier and build its category label list."""
    checkpoint_path = f"{root_path}/checkpoint"
    self.model_path = (
        f"{checkpoint_path}/koelectra-wellness-text-classification.pth")
    model_name_or_path = "monologg/koelectra-base-discriminator"

    # NOTE(review): `device` looks like a module-level global — confirm.
    checkpoint = torch.load(self.model_path, map_location=device)
    electra_config = ElectraConfig.from_pretrained(model_name_or_path)
    self.model = koElectraForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path=model_name_or_path,
        config=electra_config,
        num_labels=359)
    self.model.load_state_dict(checkpoint['model_state_dict'])
    self.model.to(device)
    self.model.eval()
    self.tokenizer = ElectraTokenizer.from_pretrained(model_name_or_path)

    # Collect one category name per distinct index from the data file;
    # each line appears to be "<text>\t<index>\t<category>" — TODO confirm.
    self.category = []
    idx = -1
    # Idiom fix: iterate the file directly instead of while True/readline.
    with open(root_path + '/data/wellness_data_for_text_classification.txt',
              'r') as f:
        for line in f:
            datas = line.strip().split("\t")
            if datas[1] != str(idx):
                self.category.append(datas[2])
                idx += 1
def main(args):
    """Run relation-extraction inference on the test set and save a csv.

    (Translated from the original Korean docstring: inference works on any
    dataset that follows the given tsv format.)
    """
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Load tokenizer.
    MODEL_NAME = 'google/electra-base-discriminator'
    tokenizer = ElectraTokenizer.from_pretrained(MODEL_NAME)

    # Resolve e.g. "Electra" -> transformers.ElectraForSequenceClassification.
    model_module = getattr(import_module("transformers"),
                           args.model_type + "ForSequenceClassification")
    model = model_module.from_pretrained(args.model_dir)
    # Fix: removed the original no-op `model.parameters` attribute access
    # (it was never called).
    model.to(device)

    # Load the test dataset.
    test_dataset_dir = "/opt/ml/input/data/test/test_with_pororo.txt"
    test_dataset, test_label = load_test_dataset(test_dataset_dir, tokenizer)
    test_dataset = RE_Dataset(test_dataset, test_label)

    # Predict answers.
    pred_answer = inference(model, test_dataset, device)

    # Write predictions; the directory and column layout below must be kept
    # (translated from the original Korean note).
    output = pd.DataFrame(pred_answer, columns=['pred'])
    output.to_csv(args.out_path, index=False)
def get_model_and_tokenizer(model_name, device):
    """Build the requested classifier and tokenizer, restoring the
    checkpoint at ``CHECK_POINT[model_name]`` when one exists on disk.

    Returns:
        (model, tokenizer)

    Raises:
        ValueError: for an unknown ``model_name`` (the original fell through
        the if/elif chain and crashed later with a NameError on the unbound
        ``model``/``tokenizer`` locals).
    """
    save_ckpt_path = CHECK_POINT[model_name]
    if model_name == "koelectra":
        model_name_or_path = "monologg/koelectra-base-discriminator"
        tokenizer = ElectraTokenizer.from_pretrained(model_name_or_path)
        electra_config = ElectraConfig.from_pretrained(model_name_or_path)
        model = koElectraForSequenceClassification.from_pretrained(
            pretrained_model_name_or_path=model_name_or_path,
            config=electra_config,
            num_labels=359)
    elif model_name == 'kobert':
        tokenizer = get_tokenizer()
        model = KoBERTforSequenceClassfication()
    else:
        # Fix: fail fast instead of referencing unbound locals below.
        raise ValueError(f"Unknown model_name: {model_name!r}")

    if os.path.isfile(save_ckpt_path):
        checkpoint = torch.load(save_ckpt_path, map_location=device)
        pre_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['model_state_dict'])
        print(f"load pretrain from: {save_ckpt_path}, epoch={pre_epoch}")
    return model, tokenizer
def prediction(text):
    """Classify `text` with a saved ELECTRA model and return predicted
    label indices as a flat numpy array."""
    tok = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')
    PATH = "/mnt/01D557900A25E360/Study/projects/ELECTRA_FINAL-20201210T023739Z-001/ELECTRA_FINAL/state_electra_final_model.pt"
    net = torch.load(PATH, map_location='cpu')

    # Tokenize, pad/truncate to 64 tokens, and collect attention masks.
    encoded = tok.batch_encode_plus(
        text, max_length=64, add_special_tokens=True,
        return_attention_mask=True, pad_to_max_length=True, truncation=True)
    ids = torch.tensor(encoded["input_ids"])
    masks = torch.tensor(encoded["attention_mask"])

    dataset = TensorDataset(ids, masks)
    loader = DataLoader(dataset, sampler=SequentialSampler(dataset),
                        batch_size=1)

    batch_logits = []
    for batch in loader:
        b_ids, b_mask = tuple(t.to('cpu') for t in batch)
        with torch.no_grad():
            out = net(b_ids, token_type_ids=None, attention_mask=b_mask)
        print(out)
        batch_logits.append(out[0].detach().cpu().numpy())

    # Flatten per-batch logit matrices, then take the argmax per row.
    stacked = [row for chunk in batch_logits for row in chunk]
    return np.argmax(stacked, axis=1).flatten()
def main(cli_args):
    """Entry point: dispatch to scoring, caching, training, or evaluation
    based on the parsed CLI flags."""
    # Update parameters (translated from Korean).
    args = AttrDict(vars(cli_args))
    args.device = "cuda" if torch.cuda.is_available(
    ) and not args.no_cuda else "cpu"
    logger = logging.getLogger(__name__)

    # Configure logger and seed (translated from Korean).
    init_logger()
    set_seed(args)

    # Load the model (translated from Korean).
    if args.do_score:
        # Scoring-only mode needs just a tokenizer: either the initial
        # pretrained weights or a specific saved checkpoint directory.
        tokenizer = ElectraTokenizer.from_pretrained(
            args.model_name_or_path if args.from_init_weight else os.path.join(
                args.output_dir, "checkpoint-{}".format(args.checkpoint)),
            do_lower_case=args.do_lower_case,
        )
        only_scoring(args, tokenizer)
    else:
        model, tokenizer = create_model(args)
        # Dispatch on the running mode (translated from Korean).
        if args.do_cache:
            prepro(args, tokenizer)
        if args.do_train:
            train(args, model, tokenizer, logger)
        elif args.do_eval:
            evaluate(args, model, tokenizer)
def predict_pair(model_args, data_args, training_args):
    """Run pairwise answer classification over a pickled test set and write
    "<col0> | <col1> | <col2> | <pred_index> | <pred_prob>" lines to a
    result file named after the data and model type."""
    # Set seed
    set_seed(training_args.seed)
    # Pick the model family by substring of model_type; BERT is the default.
    if 'roberta' in model_args.model_type:
        tokenizer = RobertaTokenizer.from_pretrained(model_args.tokenizer_name_or_path)
        config = RobertaConfig.from_pretrained(model_args.model_name_or_path)
        config.num_labels = data_args.num_labels
        model = RobertaForSequenceClassification.from_pretrained(model_args.model_name_or_path, config=config)
    elif 'electra' in model_args.model_type:
        tokenizer = ElectraTokenizer.from_pretrained(model_args.tokenizer_name_or_path)
        config = ElectraConfig.from_pretrained(model_args.model_name_or_path)
        config.num_labels = data_args.num_labels
        model = ElectraForSequenceClassification.from_pretrained(model_args.model_name_or_path, config=config)
    else:
        # default -> bert
        tokenizer = BertTokenizer.from_pretrained(model_args.tokenizer_name_or_path)
        config = BertConfig.from_pretrained(model_args.model_name_or_path)
        config.num_labels = data_args.num_labels
        model = BertForSequenceClassification.from_pretrained(model_args.model_name_or_path, config=config)
    model.to(training_args.device)

    # NOTE(review): pickle.load on an external file — only feed trusted data.
    test_df = pickle.load(open(data_args.test_data_file, 'rb'))
    test_dataset = get_dataset(data_args, tokenizer, test_df, model_args.model_type)
    data_collator = MyDataCollator()
    # Distributed (local_rank set) vs. single-node multi-GPU setup.
    if training_args.local_rank != -1:
        sampler = SequentialDistributedSampler(test_dataset)
        model = torch.nn.DataParallel(model)
    else:
        n_gpu = torch.cuda.device_count()
        if n_gpu > 1:
            model = torch.nn.DataParallel(model)
        sampler = SequentialSampler(test_dataset)
    print(len(test_dataset))
    dataloader = DataLoader(
        test_dataset,
        sampler=sampler,
        batch_size=training_args.eval_batch_size,
        collate_fn=data_collator,
    )
    model.eval()
    all_probs = []
    for inputs in tqdm(dataloader):
        for k, v in inputs.items():
            inputs[k] = v.to(training_args.device)
        # Labels are dropped: this is pure inference.
        inputs.pop('labels')
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs[0]
        probs = torch.softmax(logits, dim=-1)
        maxp, maxi = torch.max(probs, dim=-1)
        # Note the swap: each entry is stored as (argmax index, probability).
        result = [(_i, _p)
                  for _p, _i in zip(maxp, maxi)]
        all_probs.extend(result)
    with open('./{}_{}.answer_classify.result'.format(data_args.data_type, model_args.model_type), 'w', encoding='utf-8') as fout:
        for i in range(len(test_df)):
            fout.write('{} | {} | {} | {} | {}\n'.format(test_df[i][0], test_df[i][1], test_df[i][2], all_probs[i][0], all_probs[i][1]))
def punctuate_electra(input_text, download_dir, model_type="ELECTRA", format="inline"):
    """Punctuate the input text with the ELECTRA model. Capitalize sentence
    beginnings.

    Args:
        input_text: raw input handed to ``read_input`` (exact form depends
            on ``format``).
        download_dir: directory where the model files live (fetched by
            ``get_model`` if missing).
        model_type: subdirectory name of the model under ``download_dir``.
        format: input format tag passed through to ``read_input``.

    Returns:
        The punctuated text with its first letter upper-cased.
    """
    # Ensure the model files are available locally.
    get_model(model_type, download_dir)
    model_path = f"{download_dir}/{model_type}"
    config = AutoConfig.from_pretrained(model_path)
    tokenizer = ElectraTokenizer.from_pretrained(model_path)
    # "<NUM>" is an extra token; the embedding matrix is resized to match.
    tokenizer.add_tokens(["<NUM>"])
    pytorch_model = ElectraForTokenClassification.from_pretrained(model_path)
    pytorch_model.resize_token_embeddings(len(tokenizer))
    # Map predicted label names to the punctuation character to insert.
    punctuation_dict = {
        "COMMA": ",",
        "PERIOD": ".",
        "QUESTIONMARK": "?",
        "EXCLAMATIONMARK": "!",
        "COLON": ":",
        "SEMICOLON": ";",
        "DASH": "-",
    }
    eos_punct = [".", "?", "!"]
    labels = config.id2label
    # Read the input and clean of non-printable characters
    input_list = read_input(input_text, format).split()
    # split up long lines to not exceed the training sequence length
    n = 60
    text_to_punctuate = []
    if len(input_list) > n:
        line_part = [
            " ".join(input_list[x:x + n]) for x in range(0, len(input_list), n)
        ]
        text_to_punctuate.extend(line_part)
    elif len(input_list) == 0:
        pass
    else:
        text_to_punctuate.append(" ".join(input_list))
    punctuated_text = []
    for t in text_to_punctuate:
        input_ids = tokenizer(t, return_tensors="pt")["input_ids"]
        tokens = tokenizer.tokenize(t)
        predictions = pytorch_model(input_ids)
        pred_ids = np.argmax(
            predictions[0].detach().numpy(),
            axis=2)[0]  # Take the first matrix, since only have batch size 1
        # Skip position 0 (presumably the leading special token, so that the
        # labels align with `tokens` — TODO confirm).
        predictions = [labels[pred_ids[i]] for i in range(1, len(pred_ids))]
        line_punctuated = iterate(tokens, predictions, eos_punct,
                                  punctuation_dict)
        punctuated_text.append(line_punctuated)
    return upcase_first_letter(" ".join(punctuated_text))
def __init__(self, args, device):
    """Hold an ELECTRA evaluator model (eval mode) plus scoring settings."""
    self.args = args
    self.device = device
    self.alpha = args.alpha
    self.tokenizer = ElectraTokenizer.from_pretrained(args.model_evaluator)
    evaluator = ElectraModel.from_pretrained(args.model_evaluator)
    self.model = evaluator.eval().to(device)
    self.cos = nn.CosineSimilarity(dim=-1, eps=1e-6)
def _test_ElectraForPreTraining(self, size, large=False):
    """Export/run check for TFElectraForPreTraining at the given size."""
    from transformers import ElectraTokenizer, TFElectraForPreTraining
    tok = ElectraTokenizer.from_pretrained(size)
    net = TFElectraForPreTraining.from_pretrained(size)
    features = tok("Hello, my dog is cute", return_tensors="tf")
    signature, features = self.spec_and_pad(features)
    self.run_test(net, features, input_signature=signature,
                  outputs=["logits"], large=large)
def _test_ElectraForMaskedLM(self, size, large=False):
    """Export/run check for TFElectraForMaskedLM at the given size."""
    from transformers import ElectraTokenizer, TFElectraForMaskedLM
    tok = ElectraTokenizer.from_pretrained(size)
    net = TFElectraForMaskedLM.from_pretrained(size)
    features = tok("The capital of France is [MASK].", return_tensors="tf")
    # Gold labels are the fully-realized sentence's input ids.
    features["labels"] = tok("The capital of France is Paris.",
                             return_tensors="tf")["input_ids"]
    signature, features = self.spec_and_pad(features)
    self.run_test(net, features, input_signature=signature,
                  outputs=["logits"], large=large)
def download_model(outputdir_tokenizer: str, outputdir_pretrained: str):
    """Fetch and save the ELECTRA QA model plus its (slow) tokenizer.

    NOTE(review): the tokenizer is loaded from "bert-base-uncased" while the
    model comes from "google/electra-base-discriminator" — confirm this
    vocabulary choice is intentional.
    """
    slow_tokenizer = ElectraTokenizer.from_pretrained("bert-base-uncased")
    print("Save tokenizer to ", outputdir_tokenizer)
    slow_tokenizer.save_pretrained(outputdir_tokenizer)
    qa_model = ElectraForQuestionAnswering.from_pretrained(
        "google/electra-base-discriminator")
    qa_model.save_pretrained(outputdir_pretrained)
    print("Save model electra pretrained to", outputdir_pretrained)
def _test_ElectraForQuestionAnswering(self, size, large=False):
    """Export/run check for TFElectraForQuestionAnswering at the given size."""
    from transformers import ElectraTokenizer, TFElectraForQuestionAnswering
    tok = ElectraTokenizer.from_pretrained(size)
    net = TFElectraForQuestionAnswering.from_pretrained(size)
    question = "Who was Jim Henson?"
    text = "Jim Henson was a nice puppet"
    features = tok(question, text, return_tensors='tf')
    signature, features = self.spec_and_pad(features, max_length=128)
    self.run_test(net, features, input_signature=signature,
                  outputs=["start_logits", "end_logits"], large=large)
def _test_ElectraForSequenceClassification(self, size, large=False):
    """Export/run check for TFElectraForSequenceClassification."""
    from transformers import ElectraTokenizer, TFElectraForSequenceClassification
    tok = ElectraTokenizer.from_pretrained(size)
    net = TFElectraForSequenceClassification.from_pretrained(size)
    features = tok("Hello, my dog is cute", return_tensors="tf")
    features["labels"] = tf.reshape(tf.constant(1), (-1, 1))  # Batch size 1
    signature, features = self.spec_and_pad(features)
    self.run_test(net, features, input_signature=signature,
                  outputs=["logits"], large=large)
def _test_ElectraForTokenClassification(self, size, large=False):
    """Export/run check for TFElectraForTokenClassification."""
    from transformers import ElectraTokenizer, TFElectraForTokenClassification
    tok = ElectraTokenizer.from_pretrained(size)
    net = TFElectraForTokenClassification.from_pretrained(size)
    features = tok("Hello, my dog is cute", return_tensors="tf")
    signature, features = self.spec_and_pad(features, max_length=128)
    self.run_test(net, features, input_signature=signature,
                  outputs=["logits"], large=large)
def main():
    """Encode each document's sentences with KoELECTRA (small v2) and save
    per-document embedding matrices under summary_embed2/."""
    start = time.time()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # KoELECTRA model load (v2) — translated from the Korean comment.
    electra = 'monologg/koelectra-small-v2-discriminator'
    model = ElectraModel.from_pretrained(electra).to(device)
    tokenizer = ElectraTokenizer.from_pretrained(electra)

    # Load the document data file (translated comment).
    with open(corpus_dir + 'doc_summary4.json', 'r', encoding='utf-8') as f:
        j_doc = json.load(f)
    train_sets = j_doc

    # Fix: removed the unused `max_available_length` local.
    for doc_id in tqdm(train_sets):
        sents = j_doc[doc_id]['sents']

        # Tokenize every sentence, then right-pad with 0 to the longest one.
        encoded_sents = [tokenizer.encode(sent) for sent in sents]
        max_in_sent = max((len(e) for e in encoded_sents), default=0)
        for i, e in enumerate(encoded_sents):
            encoded_sents[i] += [0] * (max_in_sent - len(e))

        # Run the encoder in mini-batches of 8 sentences to bound memory.
        split_unit = 8
        results = []
        for i in range(0, len(encoded_sents), split_unit):
            j = min(i + split_unit, len(encoded_sents))
            x_inputs = torch.LongTensor(encoded_sents[i:j]).to(device)
            y = model(x_inputs)
            results.append(y[0].cpu().detach())
        y_np = torch.cat(results).numpy()
        np.save(f'summary_embed2/{doc_id}.embed', y_np)

    # Wall-clock timing (original note: CPU execution-time test).
    elapsed = time.time() - start
    print(elapsed)
    print('Ok.')
def __init__(self, root, mode):
    """Load <root>/<mode>.csv, preprocess its text column, and (for train
    mode only) drop rows whose text became empty after preprocessing."""
    self.data = pd.read_csv(os.path.join(root, '{}.csv'.format(mode)))
    self.mode = mode
    # Fix: dropped the redundant `tokenizer =` local alias from the original
    # chained assignment (`self.tokenizer = tokenizer = ...`).
    self.tokenizer = ElectraTokenizer.from_pretrained(
        'google/electra-base-discriminator')
    self.data['text'] = self.data['text'].apply(self.preprocess)
    if self.mode == 'train':
        self.data = self.data[self.data['text'] != '']
    self.data = self.data[['text', 'target']]
    self.labels = self.data.target.values
    self.text = self.data.text.values
def test_transformers_embedding_1(self):
    """A WordPiece encoder built from ELECTRA indexes a dataset and encodes
    token ids into hidden-size vectors."""
    from transformers import ElectraModel, ElectraTokenizer
    weight_path = "google/electra-small-generator"
    backbone = ElectraModel.from_pretrained(weight_path)
    tok = ElectraTokenizer.from_pretrained(weight_path)
    encoder = TransformersWordPieceEncoder(backbone, tok)
    ds = DataSet({'words': ["this is a test . [SEP]".split()]})
    encoder.index_datasets(ds, field_name='words')
    self.assertTrue(ds.has_field('word_pieces'))
    output = encoder(torch.LongTensor([[1, 2, 3, 4]]))
    self.assertEqual(output.size(), (1, 4, backbone.config.hidden_size))
def construct_encoder(self):
    """Build a DataParallel ELECTRA extractor in eval mode on CUDA, plus
    its lower-casing tokenizer; return (model, tokenizer)."""
    base = ElectraModel.from_pretrained('google/' + self.model_name,
                                        output_hidden_states=True,
                                        output_attentions=True)
    extractor = ElectraExtractor(base, location=None, heads=None)
    extractor.cuda()
    extractor = torch.nn.DataParallel(extractor)
    extractor.eval()
    print('tokenizer', self.model_name)
    tok = ElectraTokenizer.from_pretrained('google/' + self.model_name,
                                           do_lower_case=True)
    print("Model and tokenzier are constructed!")
    return extractor, tok
def test_transformers_embedding_1(self):
    """TransformersEmbedding over an ELECTRA backbone maps token ids to
    hidden-size vectors (with word dropout enabled)."""
    from transformers import ElectraModel, ElectraTokenizer
    weight_path = "google/electra-small-generator"
    vocab = Vocabulary().add_word_lst(
        "this is a test . [SEP] NotInRoberta".split())
    backbone = ElectraModel.from_pretrained(weight_path)
    tok = ElectraTokenizer.from_pretrained(weight_path)
    embed = TransformersEmbedding(vocab, backbone, tok, word_dropout=0.1)
    token_ids = torch.LongTensor([[2, 3, 4, 1]])
    output = embed(token_ids)
    self.assertEqual(output.size(), (1, 4, backbone.config.hidden_size))
def test_masked_by_flag(self):
    """Smoke test: a GenNLPMaskedDataset builds with masked_by_flag enabled.

    Successful construction is the assertion; the original trailing `pass`
    was dead code and has been removed.
    """
    batch_paths = [
        '/client/user1/cuongdev/GenImputation/data/test/electra_G1K_22_hs37d5/corpus_dir/G1K_22_hs37d5_biallelic_test.r0000.b0000.page.gz'
    ]
    tokenizer = ElectraTokenizer(
        vocab_file=
        '/client/user1/cuongdev/GenImputation/data/train/electra_G1K_22_hs37d5/data_dir/vocab.txt'
    )
    test_dataset = GenNLPMaskedDataset(batch_paths,
                                       tokenizer,
                                       masked_by_flag=True,
                                       only_input=True)
def main():
    """Train the abusing-text classifier with PyTorch Lightning."""
    # Config
    config = TrainConfig()

    # Fixing Seed
    pl.seed_everything(config.seed)

    # Logger
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    handler = logging.StreamHandler(sys.stdout)
    formatter = logging.Formatter("[%(asctime)s] %(message)s")
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    # Data Loading...
    raw_train_instances = load_data(config.train_file_path)
    raw_dev_instances = load_data(config.dev_file_path)
    logger.info(
        f"훈련용 예시 개수:{len(raw_train_instances)}\t 검증용 예시 개수:{len(raw_dev_instances)}"
    )
    tokenizer = ElectraTokenizer.from_pretrained(config.pretrained_model_name,
                                                 do_lower_case=False)
    train_dataset = AbusingDataset(raw_train_instances, tokenizer)
    valid_dataset = AbusingDataset(raw_dev_instances, tokenizer)
    # Only the training loader shuffles; both use the dataset's collate_fn.
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=config.batch_size,
        shuffle=True,
        num_workers=config.num_workers,
        collate_fn=AbusingDataset.collate_fn,
    )
    val_dataloader = DataLoader(
        valid_dataset,
        batch_size=config.batch_size,
        num_workers=config.num_workers,
        collate_fn=AbusingDataset.collate_fn,
    )

    # Lightning
    lightning_module = AbusingClassifier(config, logger)
    # deterministic=True pairs with seed_everything above for reproducibility.
    trainer = pl.Trainer(
        gpus=config.gpus,
        max_epochs=config.num_epochs,
        deterministic=True,
        weights_save_path=config.save_model_file_prefix,
        gradient_clip_val=1.0,
    )
    trainer.fit(lightning_module, train_dataloader, val_dataloader)
def __init__(self, configs: dict):
    """Set up an evaluation/inference helper from a config dictionary.

    Resolves the model checkpoint path, builds the matching tokenizer and
    label converter, and loads the serialized model according to
    ``configs["model_select"]``.

    Raises:
        ValueError: for an unsupported "model_select" value (the original
        chain silently left ``self.model_path`` unset in that case, causing
        a later AttributeError).
    """
    # Keep the parameter dictionary (translated from Chinese comments).
    self.configs = configs

    # Resolve the model checkpoint path for the selected model; the original
    # seven-branch if/elif chain collapses to one dict lookup.
    supported = ("fasttext", "lstm_base", "lstm_pack", "textcnn",
                 "bert", "electra", "xlnet")
    model_select = self.configs["model_select"]
    if model_select not in supported:
        raise ValueError(f"Unsupported model_select: {model_select!r}")
    self.model_path = self.configs["model_path"][model_select]

    # Model name used for dispatching below.
    self.model_name = model_select

    # Data file used for evaluation / inference.
    self.eval_data_file = self.configs["eval_data_file"]
    # Label mapping path: predicted label ids are converted back to the
    # real labels.
    self.label2index_json_path = self.configs["eval_label_transfer_file"]
    # Token->index vocabulary used by the custom-model tokenizers.
    self.token2index_json_path = self.configs["eval_token_transfer_file"]

    # Build the tokenizer.
    if self.model_name in ["fasttext", "lstm_base", "lstm_pack", "textcnn"]:
        self.tokenizer = SequenceTokenizer(
            load_json(self.token2index_json_path))
    elif self.model_name == "bert":
        self.tokenizer = BertTokenizer.from_pretrained(
            self.configs["pretrained_model_path"]["bert"])
    elif self.model_name == "electra":
        self.tokenizer = ElectraTokenizer.from_pretrained(
            self.configs["pretrained_model_path"]["electra"])
    elif self.model_name == "xlnet":
        self.tokenizer = XLNetTokenizer.from_pretrained(
            self.configs["pretrained_model_path"]["xlnet"])

    # Label converter.
    self.label_tokenizer = ClassificationLabelTokenizer(
        load_json(self.label2index_json_path))

    # Load the serialized model.
    # NOTE(review): torch.load unpickles arbitrary code — only load
    # trusted checkpoints.
    self.model = torch.load(self.model_path)