def test_sentences(self, sentences):
    stopFlag = False
    tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')

    # Switch to evaluation mode.
    self.model.eval()

    # Convert the sentences into input tensors.
    inputs, masks = self.convert_input_data(sentences)

    device = torch.device("cpu")

    # Move the data to the target device.
    b_input_ids = inputs.to(device).long()
    b_input_mask = masks.to(device).long()

    # No gradient computation.
    with torch.no_grad():
        # Forward pass.
        outputs = self.model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask)

    logits = np.argmax(outputs[0].to('cpu').numpy(), axis=2)

    # join bpe split tokens
    tokens = tokenizer.convert_ids_to_tokens(b_input_ids.to('cpu').numpy()[0])
    new_tokens, new_labels = [], []
    for token, label_idx in zip(tokens, logits[0]):
        if stopFlag:
            break
        if token == '[SEP]':  # Only keep output up to [SEP], i.e. before the padding.
            stopFlag = True
        new_labels.append(self.tag_dict_decode[label_idx])
        new_tokens.append(token)

    # return logits
    return new_labels, new_tokens

def __init__(self):
    super().__init__()
    self.bert = BertModel.from_pretrained('monologg/kobert')
    self.linear = nn.Linear(self.bert.config.hidden_size, 2)
    self.tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
    self.batch_size = 64
    self.lr = 3e-05

def main(args):
    """Runs inference on any file that has the same format as the given dataset TSV."""
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # load tokenizer
    # TOK_NAME = "bert-base-multilingual-cased"
    # tokenizer = AutoTokenizer.from_pretrained(TOK_NAME)
    tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')

    # load my model
    MODEL_NAME = args.model_dir  # model dir.
    model = BertForSequenceClassification.from_pretrained(args.model_dir)
    model.to(device)

    # load test dataset
    test_dataset_dir = "/opt/ml/input/data/test/test.tsv"
    test_dataset, test_label = load_test_dataset(test_dataset_dir, tokenizer)
    test_dataset = RE_Dataset(test_dataset, test_label)

    # predict answer
    pred_answer = inference(model, test_dataset, device)

    # Make a CSV file with the predicted answers.
    # Keep the directory and column layout below unchanged.
    output = pd.DataFrame(pred_answer, columns=['pred'])
    output.to_csv('./prediction/submission.csv', index=False)

def infer(test_data, **kwargs):
    '''
    :param test_data: list of noisy sentences
    :return: list of corrected sentences
    '''
    # Special-character stripping is disabled for now.
    # test_data = preprocess_noisy_sentence_list(test_data)

    if args.tokenizer == 'char':
        tokenizer_ = tokenizer
    if args.tokenizer == 'kobert':
        tokenizer_ = KoBertTokenizer.from_pretrained('monologg/kobert')

    if True:  # Beam-search inference is forced on; the greedy branch below is kept for reference.
        print("I'm beam search infer")
        prediction = correct_beam(model, tokenizer_, test_data, args,
                                  eos=eos, length_limit=0.15)

        # check = 0
        # for idx, pred in enumerate(prediction):
        #     if pred == "그렇게 하면 않지.":
        #         prediction[idx] = '그렇게 하면 안 되지.'
        #         check += 1
        #     elif pred == "이런 어의 없는 경우를 봤나.":
        #         check += 1
        #         prediction[idx] = '이런 어이없는 경우를 봤나.'
        #     elif pred == "차는 검정색이 이쁜 거 같애.":
        #         check += 1
        #         prediction[idx] = '차는 검은색이 이쁜 거 같아.'
        #     if check == 3:
        #         break

        for i in range(len(test_data)):
            print("noisy: ", test_data[i])
            print("clean: ", prediction[i])
            print("======")
        return prediction
    else:
        prediction = correct(model, tokenizer_, test_data, args,
                             eos=eos, length_limit=0.15)
        for i in range(len(test_data)):
            print("noisy: ", test_data[i])
            print("clean: ", prediction[i])
            print("======")
        return prediction

def __init__(self, model_type, max_len):
    self.max_len = max_len
    if model_type == "bert-base-multilingual-cased":
        self.tokenizer = BertTokenizer.from_pretrained(model_type, do_lower_case=False)
    elif model_type == "monologg/kobert":
        self.tokenizer = KoBertTokenizer.from_pretrained(model_type)
    elif model_type == "etri/korbert":
        pass

def infer(test_data, **kwargs):
    '''
    :param test_data: list of noisy sentences
    :return: list of corrected sentences
    '''
    # Special-character stripping is disabled for now.
    # test_data = preprocess_noisy_sentence_list(test_data)
    if args.tokenizer == 'kobert':
        tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
    return correct(model, tokenizer, test_data, args, eos=eos, length_limit=0.1)

def main(args):
    """Runs inference on any file that has the same format as the given dataset TSV."""
    seed_everything(args.seed)
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # load tokenizer
    TOK_NAME = args.token
    if TOK_NAME == "monologg/kobert":
        tokenizer = KoBertTokenizer.from_pretrained(TOK_NAME)
    else:
        tokenizer = AutoTokenizer.from_pretrained(TOK_NAME)

    # load my model
    bert_config = BertConfig.from_pretrained(TOK_NAME)
    bert_config.num_labels = args.num_labels
    bert_config.num_hidden_layers = args.num_hidden_layers
    model = BertForSequenceClassification(bert_config)
    model_dir = os.path.join(args.model_dir, args.name)
    model_path = os.path.join(model_dir, 'best.pth')

    # load test dataset
    test_dataset_dir = "/opt/ml/input/data/test/test.tsv"
    test_dataset, test_label = load_test_dataset(test_dataset_dir, model, tokenizer, args)
    test_dataset = RE_Dataset(test_dataset, test_label)

    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)

    # predict answer
    batch_size = args.batch_size
    print("Inference Start!!!")
    pred_answer = inference(model, test_dataset, device, batch_size)

    # Make a CSV file with the predicted answers.
    # Keep the directory and column layout below unchanged.
    output = pd.DataFrame(pred_answer, columns=['pred'])
    save_dir = os.path.join(args.output_dir, args.name)
    os.makedirs(save_dir, exist_ok=True)
    output.to_csv(os.path.join(save_dir, f'{args.name}.csv'), index=False)

def define_tokenizer(name):
    if name in [
            "bert-base-multilingual-cased",
            "sangrimlee/bert-base-multilingual-cased-korquad",
            "kykim/bert-kor-base",
    ]:
        return BertTokenizer.from_pretrained(name)
    elif name in [
            "monologg/koelectra-base-v3-discriminator",
            "kykim/electra-kor-base",
    ]:
        return ElectraTokenizer.from_pretrained(name)
    elif name in ["xlm-roberta-large"]:
        return XLMRobertaTokenizer.from_pretrained(name)
    elif name in ["monologg/kobert"]:
        return KoBertTokenizer.from_pretrained(name)
    elif name in ["kykim/funnel-kor-base"]:
        return FunnelTokenizer.from_pretrained(name)

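# For reference, a minimal usage sketch of define_tokenizer above. The sample
# sentence is illustrative only; any model name handled by the dispatch works.
tokenizer = define_tokenizer("monologg/kobert")
print(tokenizer.tokenize("한국어 토크나이저 예시 문장"))  # SentencePiece sub-tokens, e.g. prefixed with '▁'
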
def korean_bert_example():
    if False:
        pretrained_model_name = 'bert-base-multilingual-uncased'
        #pretrained_model_name = 'bert-base-multilingual-cased'  # Not correctly working.
        tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)
    else:
        # REF [site] >> https://github.com/monologg/KoBERT-Transformers
        from tokenization_kobert import KoBertTokenizer

        # REF [site] >> https://huggingface.co/monologg
        pretrained_model_name = 'monologg/kobert'
        #pretrained_model_name = 'monologg/distilkobert'
        tokenizer = KoBertTokenizer.from_pretrained(pretrained_model_name)

    tokens = tokenizer.tokenize('잘해놨습니다')
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    print('Tokens = {}.'.format(tokens))
    #print('Token IDs = {}.'.format(token_ids))

    model = BertForSequenceClassification.from_pretrained(pretrained_model_name)

    #--------------------
    input_ids = [
        tokenizer.encode('내 개는 무척 귀여워.', add_special_tokens=True),
        tokenizer.encode('내 고양이는 귀여워.', add_special_tokens=True),
        tokenizer.encode('내 돼지는 너무 작아요.', add_special_tokens=True),
    ]
    max_input_len = len(max(input_ids, key=len))
    print('Max. input len = {}.'.format(max_input_len))

    def convert(x):
        y = [x[-1]] * max_input_len  # TODO [check] >> Is padding with x[-1] (the last token id) correct?
        y[:len(x)] = x
        return y

    input_ids = list(map(convert, input_ids))
    input_ids = torch.tensor(input_ids)

    model.eval()
    with torch.no_grad():
        model_outputs = model(input_ids)  # Batch size x #labels.
    print('Model output losses = {}.'.format(model_outputs.loss))  # None here, since no labels were passed.
    print('Model output logits = {}.'.format(model_outputs.logits))

def convert_input_data(self, sentences):
    text_CLS = ["[CLS] " + str(txt) + " [SEP]" for txt in sentences]
    tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')

    # Tokenize.
    tokenized_texts = [tokenizer.tokenize(sent) for sent in text_CLS]

    MAX_LEN = 128  # maximum sequence length

    # Convert tokens to ids and pad.
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long",
                              truncating="post", padding="post")

    # Build attention masks.
    attention_masks = []
    for seq in input_ids:
        seq_mask = [float(i > 0) for i in seq]
        attention_masks.append(seq_mask)

    # Convert the data to PyTorch tensors.
    inputs = torch.tensor(input_ids)
    masks = torch.tensor(attention_masks)
    return inputs, masks

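# A brief, hypothetical usage sketch of convert_input_data: `preprocessor` is
# assumed to be an instance of the surrounding class (not shown above).
inputs, masks = preprocessor.convert_input_data(
    ["이순신은 조선 중기의 무신이다.", "아버지가 방에 들어가신다."])
print(inputs.shape, masks.shape)  # both torch.Size([2, 128]) with MAX_LEN = 128
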
def main(args):
    """Runs inference on any file that has the same format as the given dataset TSV."""
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # load tokenizer
    # TOK_NAME = "bert-base-multilingual-cased"
    # tokenizer = AutoTokenizer.from_pretrained(TOK_NAME)
    tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')

    # load my model
    MODEL_NAME = args.model_dir  # model dir.
    model = BertForSequenceClassification.from_pretrained(args.model_dir)
    model.to(device)

    # load test dataset
    test_dataset_dir = "/opt/ml/input/data/test/test.tsv"
    test_dataset, test_label = load_test_dataset(test_dataset_dir, tokenizer)
    test_dataset = RE_Dataset(test_dataset, test_label)

    # predict answer
    pred_answer = inference(model, test_dataset, device)

    # Make a CSV file with the predicted answers.
    # Keep the directory and column layout below unchanged.
    results = []
    for result in pred_answer:
        results.extend(result)
    results = np.array(results)
    print(results.shape)
    output = pd.DataFrame(results, columns=list(range(1, 43)))  # class columns 1..42
    output.to_csv('./prediction/fold6.csv', index=False)

import argparse

import torch
import torch.optim as optim
from transformers import AutoTokenizer

from tokenizer_methods import space_tokenizer, char_tokenizer, jamo_split, khaiii_tokenize, mecab, okt, komoran
from data_loader import dataloader
from model import RNN
from utils import train, evaluate, binary_accuracy, epoch_time
from tokenization_kobert import KoBertTokenizer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

parser = argparse.ArgumentParser(description="hyperparameter settings")

# huggingface tokenizers
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased").tokenize
kobert_tokenizer = KoBertTokenizer.from_pretrained("monologg/kobert").tokenize
koelectra_tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-v2-discriminator").tokenize

# tokenizer list
tokenizers = [space_tokenizer, char_tokenizer, jamo_split, khaiii_tokenize,
              mecab, okt, komoran, bert_tokenizer, kobert_tokenizer, koelectra_tokenizer]
tokenizer_names = ['space', 'character', 'syllable', 'khaiii', 'mecab', 'okt',
                   'komoran', 'bert', 'kobert', 'koelectra']
# tokenizers = [kobert_tokenizer]
# tokenizer_names = ['kobert']

if __name__ == '__main__':
    # hyperparameters
    parser.add_argument('--n_epochs', required=False, default=10, type=int)
    parser.add_argument('--max_vocab_size', required=False, default=30000, type=int)
    parser.add_argument('--batch_size', required=False, default=64, type=int)

def __init__(self, model_type, max_len):
    self.tokenizer = KoBertTokenizer.from_pretrained(model_type, do_lower_case=False)
    self.max_len = max_len
    self.ignore_index = torch.nn.CrossEntropyLoss().ignore_index

def korean_table_question_answering_example():
    from transformers import pipeline
    from transformers import TapasConfig, TapasForQuestionAnswering, TapasTokenizer
    import pandas as pd
    # REF [site] >> https://github.com/monologg/KoBERT-Transformers
    from tokenization_kobert import KoBertTokenizer

    data_dict = {
        '배우': ['송광호', '최민식', '설경구'],
        '나이': ['54', '58', '53'],
        '출연작품수': ['38', '32', '42'],
        '생년월일': ['1967/02/25', '1962/05/30', '1967/05/14'],
    }
    data_df = pd.DataFrame.from_dict(data_dict)

    if False:
        # Show the data frame.
        from IPython.display import display, HTML
        display(data_df)
        #print(HTML(data_df.to_html()).data)

    query = '최민식씨의 나이는?'

    # REF [site] >> https://huggingface.co/monologg
    pretrained_model_name = 'monologg/kobert'
    #pretrained_model_name = 'monologg/distilkobert'

    if False:
        # Not working.
        table_pipeline = pipeline(
            'table-question-answering',
            model=pretrained_model_name,
            tokenizer=KoBertTokenizer.from_pretrained(pretrained_model_name)
        )
    elif False:
        # Not working.
        #config = TapasConfig(num_aggregation_labels=3, average_logits_per_cell=True, select_one_column=False)
        #model = TapasForQuestionAnswering.from_pretrained(pretrained_model_name, config=config)
        model = TapasForQuestionAnswering.from_pretrained(pretrained_model_name)
        table_pipeline = pipeline(
            'table-question-answering',
            model=model,
            tokenizer=KoBertTokenizer.from_pretrained(pretrained_model_name)
        )
    else:
        # Not correctly working.
        model = TapasForQuestionAnswering.from_pretrained(pretrained_model_name)
        table_pipeline = pipeline(
            'table-question-answering',
            model=model,
            tokenizer=TapasTokenizer.from_pretrained(pretrained_model_name)
        )

    answer = table_pipeline(data_dict, query)
    #answer = table_pipeline(data_df, query)
    print('Answer: {}.'.format(answer))

def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.")
    parser.add_argument("--model_type", default=None, type=str, required=True,
                        help="Model type selected in the list: " + ", ".join(MODEL_TYPES))
    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    # Other parameters
    parser.add_argument("--labels", default="", type=str,
                        help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.")
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
    parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.")
    parser.add_argument("--evaluate_during_training", action="store_true",
                        help="Whether to run evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", action="store_true",
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--keep_accents", action="store_const", const=True,
                        help="Set this flag if model is trained with accents.")
    parser.add_argument("--strip_accents", action="store_const", const=True,
                        help="Set this flag if model is trained without accents.")
    parser.add_argument("--use_fast", action="store_const", const=True,
                        help="Set this flag to use fast tokenization.")
    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
    parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
    parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
    parser.add_argument("--eval_all_checkpoints", action="store_true",
                        help="Evaluate all checkpoints starting with the same prefix as model_name and ending with step number")
    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
    parser.add_argument("--overwrite_output_dir", action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument("--overwrite_cache", action="store_true",
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
    parser.add_argument("--fp16", action="store_true",
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument("--fp16_opt_level", type=str, default="O1",
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
    parser.add_argument("--log_filename", default='./test.log', type=str, required=False,
                        help="Path to log file of the experiment.")
    args = parser.parse_args()

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        filename=args.log_filename,
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16,
    )

    # Set seed
    set_seed(args)

    # Prepare CONLL-2003 task
    labels = get_labels(args.labels)
    num_labels = len(labels)
    # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
    pad_token_label_id = CrossEntropyLoss().ignore_index

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    args.model_type = args.model_type.lower()
    config = AutoConfig.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        id2label={str(i): label for i, label in enumerate(labels)},
        label2id={label: i for i, label in enumerate(labels)},
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    tokenizer_args = {k: v for k, v in vars(args).items() if v is not None and k in TOKENIZER_ARGS}
    logger.info("Tokenizer arguments: %s", tokenizer_args)
    # Force the KoBERT tokenizer.
    tokenizer = KoBertTokenizer.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        cache_dir=args.cache_dir if args.cache_dir else None,
        **tokenizer_args,
    )
    # tokenizer = AutoTokenizer.from_pretrained(
    #     args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
    #     cache_dir=args.cache_dir if args.cache_dir else None,
    #     **tokenizer_args,
    # )
    # CRF Addition
    model = BertCRFForTokenClassification.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    # model = AutoModelForTokenClassification.from_pretrained(
    #     args.model_name_or_path,
    #     from_tf=bool(".ckpt" in args.model_name_or_path),
    #     config=config,
    #     cache_dir=args.cache_dir if args.cache_dir else None,
    # )

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="train")
        global_step, tr_loss = train(args, train_dataset, model, tokenizer, labels, pad_token_label_id)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (model.module if hasattr(model, "module") else model)  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        # Force the KoBERT tokenizer.
        tokenizer = KoBertTokenizer.from_pretrained(args.output_dir, **tokenizer_args)
        # tokenizer = AutoTokenizer.from_pretrained(args.output_dir, **tokenizer_args)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)))
            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            # CRF Addition
            model = BertCRFForTokenClassification.from_pretrained(checkpoint)
            # model = AutoModelForTokenClassification.from_pretrained(checkpoint)
            model.to(args.device)
            result, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev", prefix=global_step)
            if global_step:
                result = {"{}_{}".format(global_step, k): v for k, v in result.items()}
            results.update(result)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            for key in sorted(results.keys()):
                writer.write("{} = {}\n".format(key, str(results[key])))

    if args.do_predict and args.local_rank in [-1, 0]:
        # Force the KoBERT tokenizer.
        tokenizer = KoBertTokenizer.from_pretrained(args.output_dir, **tokenizer_args)
        # tokenizer = AutoTokenizer.from_pretrained(args.output_dir, **tokenizer_args)
        # CRF Addition
        model = BertCRFForTokenClassification.from_pretrained(args.output_dir)
        # model = AutoModelForTokenClassification.from_pretrained(args.output_dir)
        model.to(args.device)
        result, predictions = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="test")

        # Save results
        output_test_results_file = os.path.join(args.output_dir, "test_results.txt")
        with open(output_test_results_file, "w") as writer:
            for key in sorted(result.keys()):
                writer.write("{} = {}\n".format(key, str(result[key])))

        # Save predictions
        output_test_predictions_file = os.path.join(args.output_dir, "test_predictions.txt")
        with open(output_test_predictions_file, "w") as writer:
            with open(os.path.join(args.data_dir, "test.txt"), "r") as f:
                example_id = 0
                for line in f:
                    if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                        writer.write(line)
                        if not predictions[example_id]:
                            example_id += 1
                    elif predictions[example_id]:
                        output_line = line.split()[0] + " " + predictions[example_id].pop(0) + "\n"
                        writer.write(output_line)
                    else:
                        logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])

    return results

def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir)
            and training_args.do_train and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank, training_args.device, training_args.n_gpu,
        bool(training_args.local_rank != -1), training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    try:
        num_labels = glue_tasks_num_labels[data_args.task_name]
        output_mode = glue_output_modes[data_args.task_name]
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    print(model_args.model_name_or_path)
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = KoBertTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    train_dataset = (GlueDataset(data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir)
                     if training_args.do_train else None)
    eval_dataset = (GlueDataset(data_args, tokenizer=tokenizer, mode="dev", cache_dir=model_args.cache_dir)
                    if training_args.do_eval else None)
    test_dataset = (GlueDataset(data_args, tokenizer=tokenizer, mode="test", cache_dir=model_args.cache_dir)
                    if training_args.do_predict else None)

    def build_compute_metrics_fn(task_name: str) -> Callable[[EvalPrediction], Dict]:
        def compute_metrics_fn(p: EvalPrediction):
            if output_mode == "classification":
                preds = np.argmax(p.predictions, axis=1)
            elif output_mode == "regression":
                preds = np.squeeze(p.predictions)
            return glue_compute_metrics(task_name, preds, p.label_ids)

        return compute_metrics_fn

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=build_compute_metrics_fn(data_args.task_name),
    )

    # Training
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path
                      if os.path.isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        eval_datasets = [eval_dataset]
        if data_args.task_name == "mnli":
            mnli_mm_data_args = dataclasses.replace(data_args, task_name="mnli-mm")
            eval_datasets.append(
                GlueDataset(mnli_mm_data_args, tokenizer=tokenizer, mode="dev", cache_dir=model_args.cache_dir))

        for eval_dataset in eval_datasets:
            trainer.compute_metrics = build_compute_metrics_fn(eval_dataset.args.task_name)
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)

            output_eval_file = os.path.join(
                training_args.output_dir, f"eval_results_{eval_dataset.args.task_name}.txt")
            if trainer.is_world_master():
                with open(output_eval_file, "w") as writer:
                    logger.info("***** Eval results {} *****".format(eval_dataset.args.task_name))
                    for key, value in eval_result.items():
                        logger.info("  %s = %s", key, value)
                        writer.write("%s = %s\n" % (key, value))

            eval_results.update(eval_result)

    if training_args.do_predict:
        logging.info("*** Test ***")
        test_datasets = [test_dataset]
        if data_args.task_name == "mnli":
            mnli_mm_data_args = dataclasses.replace(data_args, task_name="mnli-mm")
            test_datasets.append(
                GlueDataset(mnli_mm_data_args, tokenizer=tokenizer, mode="test", cache_dir=model_args.cache_dir))

        for test_dataset in test_datasets:
            predictions = trainer.predict(test_dataset=test_dataset).predictions
            y_probability = []
            if output_mode == "classification":
                y_probability = predictions
                predictions = np.argmax(predictions, axis=1)

            output_test_file = os.path.join(
                training_args.output_dir, f"test_results_{test_dataset.args.task_name}.txt")
            if trainer.is_world_master():
                with open(output_test_file, "w") as writer:
                    logger.info("***** Test results {} *****".format(test_dataset.args.task_name))
                    writer.write("index\tprediction\tprobability0\tprobability1\n")
                    for index, item in enumerate(predictions):
                        probs = F.softmax(torch.from_numpy(y_probability[index]), dim=0)
                        print(probs)
                        if output_mode == "regression":
                            writer.write("%d\t%3.3f\n" % (index, item))
                        else:
                            item = test_dataset.get_labels()[item]
                            writer.write("%d\t%s\t%s\t%s\n" % (index, item, np.array(probs[0]), np.array(probs[1])))
    return eval_results

def main():
    # from pathlib import Path
    # print("File Path:", Path(__file__).absolute())
    # print("Directory Path:", Path().absolute())
    args = get_args()
    args.n_gpu = 1

    # noisy_sents_1 = read_strings(os.path.join(args.data_dir, "train_data", "train_data"))
    # clean_sents = read_strings(os.path.join(args.data_dir, "train_label"))
    # noisy_sents_2 = read_strings(os.path.join(args.data_dir, "train_data", "train_corpus"))
    #
    # noisy_sents = noisy_sents_1 + noisy_sents_2
    # noise_space_ratio = []
    # for sentence in noisy_sents:
    #     noise_space_ratio.append(sentence.count(' ') / len(sentence))
    #
    # clean_space_ratio = []
    # for sentence in clean_sents:
    #     clean_space_ratio.append(sentence.count(' ') / len(sentence))
    #
    # print("noise_space_ratio: {}, clean_space_ratio: {}".format(
    #     sum(noise_space_ratio) / len(noise_space_ratio),
    #     sum(clean_space_ratio) / len(clean_space_ratio)))

    # ##########
    # ## for local runs
    # args.num_workers = 0
    # args.train_batch_size = 4
    # args.eval_batch_size = 4
    # args.eval_interval = 10
    # ##########

    set_seed(args)

    if args.tokenizer == 'char':
        tokenizer = CharTokenizer([])
    if args.tokenizer == 'kobert':
        print("koBERT tokenizer")
        tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
        args.vocab_size = tokenizer.vocab_size
        print(args.vocab_size)

    if args.load_vocab != "":
        tokenizer.load(args.load_vocab)
        args.vocab_size = tokenizer.__len__()

    logger.info(f"args: {json.dumps(args.__dict__, indent=2, sort_keys=True)}")

    os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = TransformerModel(
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        num_encoder_layers=args.num_encoder_layers,
        num_decoder_layers=args.num_decoder_layers,
        intermediate_size=args.intermediate_size,
        dropout=args.dropout,
    ).to(args.device)
    logger.info(f"# of model parameters: {sum(p.numel() for p in model.parameters()) * 1e-6:.2f}M")

    eos_setting = args.eos_setting

    bind_nsml(model, tokenizer, args, eos=eos_setting)
    if args.pause:
        nsml.paused(scope=locals())

    if args.mode != 'test' and args.averaging != "":
        sess = 't0005/rush1-3/37'
        checkpoints = ["4500", "6500", "7500", "8000"]

        nsml.load(checkpoint=checkpoints[0], session=sess)
        args.vocab_size = tokenizer.__len__()
        print(args.vocab_size)

        model = TransformerModel(
            vocab_size=args.vocab_size,
            hidden_size=args.hidden_size,
            num_attention_heads=args.num_attention_heads,
            num_encoder_layers=args.num_encoder_layers,
            num_decoder_layers=args.num_decoder_layers,
            intermediate_size=args.intermediate_size,
            dropout=args.dropout,
        ).to(args.device)

        params = model.named_parameters()
        new_dict_params = dict(params)

        for checkpoint in checkpoints:
            bind_nsml(model, tokenizer, args, eos=eos_setting)
            nsml.load(checkpoint=checkpoint, session=sess)
            for name, param in params:
                new_dict_params[name] += param / len(checkpoints)

        model.load_state_dict(new_dict_params, strict=False)
        bind_nsml(model, tokenizer, args, eos=eos_setting)
        nsml.save('best')

    elif args.mode == 'eval':
        print("I'm in EVAL")

        checkpoint = 'best'
        sess = 't0005/rush1-3/507'
        nsml.load(checkpoint=checkpoint, session=sess)
        args.vocab_size = tokenizer.__len__()

        model = TransformerModel(
            vocab_size=args.vocab_size,
            hidden_size=args.hidden_size,
            num_attention_heads=args.num_attention_heads,
            num_encoder_layers=args.num_encoder_layers,
            num_decoder_layers=args.num_decoder_layers,
            intermediate_size=args.intermediate_size,
            dropout=args.dropout,
        ).to(args.device)

        bind_nsml(model, tokenizer, args, eos=eos_setting)
        nsml.load(checkpoint=checkpoint, session=sess)

        model.eval()
        # noisy_sents = open("./naver_data_clean.txt", "r", encoding='utf-8').read().splitlines()
        noisy_sents = read_strings(os.path.join(args.data_dir, "train_data", "train_corpus"))
        valid_noisy = noisy_sents[:1000]

        prediction = correct_beam(model, tokenizer, valid_noisy, args, eos=True, length_limit=0.15)

        for i, pred in enumerate(prediction[:1000]):
            print("noisy_input: {}, pred: {}".format(valid_noisy[i], pred))

        # bind_txt(prediction)
        # nsml.save('prediction')

        # with open('naver_data_clean_again.txt', 'w', encoding='utf-8') as f:
        #     for i, pred in enumerate(prediction):
        #         if i % 500 == 0:
        #             print(i)
        #         f.write("%s\n" % pred)

    # Only works with the char tokenizer.
    # TODO: the kobert tokenizer needs a different vocab size if it is used here.
    elif args.mode != 'test' and args.resubmit != "":
        checkpoint = 'best'
        sess = 't0005/rush1-3/' + args.resubmit
        print(sess)

        model = None
        tokenizer = CharTokenizer([])
        bind_nsml(model, tokenizer, args, eos=eos_setting)
        nsml.load(checkpoint=checkpoint, session=sess)

        args.vocab_size = len(tokenizer)
        print(args.vocab_size)

        model = TransformerModel(
            vocab_size=args.vocab_size,
            hidden_size=args.hidden_size,
            num_attention_heads=args.num_attention_heads,
            num_encoder_layers=args.num_encoder_layers,
            num_decoder_layers=args.num_decoder_layers,
            intermediate_size=args.intermediate_size,
            dropout=args.dropout,
        ).to(args.device)

        bind_nsml(model, tokenizer, args, eos=eos_setting)
        nsml.load(checkpoint=checkpoint, session=sess)
        bind_nsml(model, tokenizer, args, eos=eos_setting)

        ########## testing loaded model & tokenizer ###############
        # model.eval()
        # noisy_sents = read_strings(os.path.join(args.data_dir, "train_data", "train_data"))
        # valid_noisy = noisy_sents[-10:]
        #
        # prediction = correct(model, tokenizer, valid_noisy, args, eos=True, length_limit=0.1)
        #
        # for pred in prediction:
        #     print(pred)
        ##################

        nsml.save("best")

    else:
        # train_data, valid_data = None, None
        if args.mode == "train" or args.mode == "pretrain" or args.mode == "semi-train":
            if args.mode == "train":
                # noisy_sents = open("./noisy_sejong_500k.txt", "r", encoding='utf-8').read().splitlines()[:20000]
                # clean_sents = open("./clean_sejong_500k.txt", "r", encoding='utf-8').read().splitlines()[:20000]
                # sents_annotation = ['None'] * len(noisy_sents)
                noisy_sents = read_strings(os.path.join(args.data_dir, "train_data", "train_data"))
                sents_annotation = read_strings(os.path.join(args.data_dir, "train_data", "train_annotation"))
                clean_sents = read_strings(os.path.join(args.data_dir, "train_label"))

            if args.mode == "semi-train":
                noisy_sents = read_strings(os.path.join(args.data_dir, "train_data", "train_data"))
                sents_annotation = read_strings(os.path.join(args.data_dir, "train_data", "train_annotation"))
                clean_sents = read_strings(os.path.join(args.data_dir, "train_label"))

                checkpoint = 'generated_data'
                sess = 't0005/rush1-1/' + str(args.semi_dataset)
                # five copies: sess = 't0005/rush1-1/209'
                # one copy: sess = 't0005/rush1-1/224'
                semi_noisy_sents, semi_clean_sents = load_generated_data(checkpoint=checkpoint, session=sess)
                semi_sents_annotation = ['None'] * len(semi_noisy_sents)

            if args.mode == "pretrain":
                print("PRETRAIN MODE ON!!")
                noisy_sents = read_strings(os.path.join('sejong_corpus', args.noisy_file))
                clean_sents = read_strings(os.path.join('sejong_corpus', args.clean_file))
                # checkpoint = 'generated_data'
                # sess = 't0005/rush1-1/113'
                # noisy_sents, clean_sents = load_generated_data(checkpoint=checkpoint, session=sess)
                sents_annotation = ['None'] * len(noisy_sents)

            error_type_counter = Counter()
            for annotation in sents_annotation:
                error_type_counter += Counter(annotation.split(','))
            print(error_type_counter)

            # noise-cleaned version:
            # pairs = [{"noisy": preprocess_sentence(noisy), "clean": clean}
            #          for noisy, clean in zip(noisy_sents, clean_sents)]
            # original version:
            if args.mode == "semi-train":
                pairs = [{"noisy": noisy, "clean": clean, "annotation": annot}
                         for noisy, clean, annot in zip(noisy_sents, clean_sents, sents_annotation)]
                semi_pairs = [{"noisy": noisy, "clean": clean, "annotation": annot}
                              for noisy, clean, annot in zip(semi_noisy_sents, semi_clean_sents, semi_sents_annotation)]

                train_data = pairs[:-args.num_val_data] + semi_pairs
                valid_data = pairs[-args.num_val_data:]
                logger.info(f"# of train data: {len(train_data)}")
                logger.info(f"# of valid data: {len(valid_data)}")

                train_sents = [x['noisy'] for x in train_data] + [x['clean'] for x in train_data]
                tokenizer = CharTokenizer.from_strings(train_sents, args.vocab_size)
                bind_nsml(model, tokenizer, args, eos=eos_setting)
            else:
                pairs = [{"noisy": noisy, "clean": clean, "annotation": annot}
                         for noisy, clean, annot in zip(noisy_sents, clean_sents, sents_annotation)]

                train_data, valid_data = train_test_split(
                    pairs, test_size=args.val_ratio, random_state=args.seed)  # test: about 1000
                logger.info(f"# of train data: {len(train_data)}")
                logger.info(f"# of valid data: {len(valid_data)}")
                # print("validation: ", valid_data)

                train_sents = [x['noisy'] for x in train_data] + [x['clean'] for x in train_data]
                # train_sents = [x['clean'] for x in train_data]

                if args.load_model != "" and args.mode == "train":  # load pretrained model
                    print("load pretrained model")
                    model.load_state_dict(torch.load(args.load_model, map_location=args.device))

                    if args.freeze:
                        model.token_embeddings.weight.requires_grad = False
                        model.decoder_embeddings.weight.requires_grad = False

                if args.tokenizer == 'char' and args.load_vocab == "":
                    tokenizer = CharTokenizer.from_strings(train_sents, args.vocab_size)
                    print(f'tokenizer loaded from strings. len={len(tokenizer)}.')

                bind_nsml(model, tokenizer, args, eos=eos_setting)

                if args.tokenizer == 'char' and tokenizer is not None:
                    tokenizer.save('vocab.txt')

        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model, dim=1)

        if args.mode == "train" or args.mode == "pretrain" or args.mode == 'semi-train':
            train(model, tokenizer, train_data, valid_data, args, eos=eos_setting)

def __init__(self, args):
    self.args = args
    self.tokenizer = KoBertTokenizer.from_pretrained("monologg/kobert")
    self.sep_vid = self.tokenizer.token2idx[self.tokenizer.sep_token]
    self.cls_vid = self.tokenizer.token2idx[self.tokenizer.cls_token]
    self.pad_vid = self.tokenizer.token2idx[self.tokenizer.pad_token]

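# Note: token2idx is a dict specific to this KoBertTokenizer implementation.
# A sketch of the equivalent lookups through the generic tokenizer API:
tokenizer = KoBertTokenizer.from_pretrained("monologg/kobert")
sep_vid = tokenizer.convert_tokens_to_ids(tokenizer.sep_token)
cls_vid = tokenizer.convert_tokens_to_ids(tokenizer.cls_token)
pad_vid = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
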
def main():
    args = get_args()
    logger.info(f"args: {json.dumps(args.__dict__, indent=2, sort_keys=True)}")

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()

    set_seed(args)

    if args.tokenizer == 'char':
        tokenizer = CharTokenizer([])
    if args.tokenizer == 'kobert':
        print("koBERT tokenizer")
        tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
        args.vocab_size = tokenizer.vocab_size
        print(args.vocab_size)

    model = TransformerModel(
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        num_encoder_layers=args.num_encoder_layers,
        num_decoder_layers=args.num_decoder_layers,
        intermediate_size=args.intermediate_size,
        dropout=args.dropout,
    ).to(args.device)
    logger.info(f"# of model parameters: {sum(p.numel() for p in model.parameters()) * 1e-6:.2f}M")

    eos_setting = args.eos_setting

    bind_nsml(model, tokenizer, args, eos=eos_setting)
    if args.pause:
        nsml.paused(scope=locals())

    # train_data, valid_data = None, None
    if args.mode == "train" or args.mode == "pretrain" or args.mode == "semi-train":
        if args.mode == "train":
            noisy_sents = read_strings(os.path.join(args.data_dir, "train_data", "train_data"))
            sents_annotation = read_strings(os.path.join(args.data_dir, "train_data", "train_annotation"))
            clean_sents = read_strings(os.path.join(args.data_dir, "train_label"))

        if args.mode == "semi-train":
            noisy_sents = read_strings(os.path.join(args.data_dir, "train_data", "train_data"))
            sents_annotation = read_strings(os.path.join(args.data_dir, "train_data", "train_annotation"))
            clean_sents = read_strings(os.path.join(args.data_dir, "train_label"))

            checkpoint = 'generated_data'
            sess = 't0005/rush1-1/' + str(args.semi_dataset)
            # five copies: sess = 't0005/rush1-1/209'
            # one copy: sess = 't0005/rush1-1/224'
            semi_noisy_sents, semi_clean_sents = load_generated_data(checkpoint=checkpoint, session=sess)
            semi_sents_annotation = ['None'] * len(semi_noisy_sents)

        if args.mode == "pretrain":
            print("PRETRAIN MODE ON!!")
            checkpoint = 'generated_data'
            sess = 't0005/rush1-1/113'
            noisy_sents, clean_sents = load_generated_data(checkpoint=checkpoint, session=sess)
            sents_annotation = ['None'] * len(noisy_sents)

        error_type_counter = Counter()
        for annotation in sents_annotation:
            error_type_counter += Counter(annotation.split(','))
        print(error_type_counter)

        # noise-cleaned version:
        # pairs = [{"noisy": preprocess_sentence(noisy), "clean": clean}
        #          for noisy, clean in zip(noisy_sents, clean_sents)]
        # original version:
        if args.mode == "semi-train":
            pairs = [{"noisy": noisy, "clean": clean, "annotation": annot}
                     for noisy, clean, annot in zip(noisy_sents, clean_sents, sents_annotation)]
            semi_pairs = [{"noisy": noisy, "clean": clean, "annotation": annot}
                          for noisy, clean, annot in zip(semi_noisy_sents, semi_clean_sents, semi_sents_annotation)]

            train_data = pairs[:-args.num_val_data] + semi_pairs
            valid_data = pairs[-args.num_val_data:]
            logger.info(f"# of train data: {len(train_data)}")
            logger.info(f"# of valid data: {len(valid_data)}")

            train_sents = [x['noisy'] for x in train_data] + [x['clean'] for x in train_data]
            tokenizer = CharTokenizer.from_strings(train_sents, args.vocab_size)
            bind_nsml(model, tokenizer, args, eos=eos_setting)
        else:
            pairs = [{"noisy": noisy, "clean": clean, "annotation": annot}
                     for noisy, clean, annot in zip(noisy_sents, clean_sents, sents_annotation)]

            train_data, valid_data = pairs[:-args.num_val_data], pairs[-args.num_val_data:]
            logger.info(f"# of train data: {len(train_data)}")
            logger.info(f"# of valid data: {len(valid_data)}")

            train_sents = [x['noisy'] for x in train_data] + [x['clean'] for x in train_data]
            # tokenizer = CharTokenizer.from_strings(train_sents, args.vocab_size)
            bind_nsml(model, tokenizer, args, eos=eos_setting)

            ## to load pretrained model
            nsml.load(checkpoint='best', session='t0005/rush1-2/79')
            # print(tokenizer.vocab)

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model, dim=1)

    if args.mode == "train" or args.mode == "pretrain" or args.mode == 'semi-train':
        train(model, tokenizer, train_data, valid_data, args, eos=eos_setting)

def main():
    args = get_args()
    logger.info(f"args: {json.dumps(args.__dict__, indent=2, sort_keys=True)}")

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()

    set_seed(args)

    if args.tokenizer == 'char':
        tokenizer = CharTokenizer([])
    if args.tokenizer == 'kobert':
        print("koBERT tokenizer")
        tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
        args.vocab_size = tokenizer.vocab_size
        print(args.vocab_size)

    model = TransformerModel(
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        num_encoder_layers=args.num_encoder_layers,
        num_decoder_layers=args.num_decoder_layers,
        intermediate_size=args.intermediate_size,
        dropout=args.dropout,
    ).to(args.device)
    logger.info(f"# of model parameters: {sum(p.numel() for p in model.parameters()) * 1e-6:.2f}M")

    eos_setting = args.eos_setting

    bind_nsml(model, tokenizer, args, eos=eos_setting)
    if args.pause:
        nsml.paused(scope=locals())

    # train_data, valid_data = None, None
    if args.mode == "train":
        noisy_sents_labeled = read_strings(os.path.join(args.data_dir, "train_data", "train_data"))
        sents_annotation_labeled = read_strings(os.path.join(args.data_dir, "train_data", "train_annotation"))
        clean_sents_labeled = read_strings(os.path.join(args.data_dir, "train_label"))
        noisy_sents = read_strings(os.path.join(args.data_dir, "train_data", "train_corpus"))

        pairs = noisy_sents
        pairs_labeled = clean_sents_labeled

        train_data = pairs + noisy_sents_labeled[:-args.num_val_data] + pairs_labeled[:-args.num_val_data]
        valid_data = pairs_labeled[-args.num_val_data:]
        logger.info(f"# of train data: {len(train_data)}")
        logger.info(f"# of valid data: {len(valid_data)}")

        train_sents = [x for x in train_data]

        if args.tokenizer == 'char':
            tokenizer = CharTokenizer.from_strings(train_sents, args.vocab_size)
            print("===vocab size: ", len(tokenizer))
            args.vocab_size = len(tokenizer)

            model = TransformerModel(
                vocab_size=args.vocab_size,
                hidden_size=args.hidden_size,
                num_attention_heads=args.num_attention_heads,
                num_encoder_layers=args.num_encoder_layers,
                num_decoder_layers=args.num_decoder_layers,
                intermediate_size=args.intermediate_size,
                dropout=args.dropout,
            ).to(args.device)

        bind_nsml(model, tokenizer, args, eos=eos_setting)

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model, dim=1)

    if args.mode == "train":
        train(model, tokenizer, train_data, valid_data, args, eos=eos_setting)

from argparse import ArgumentParser

from transformers import BertForQuestionAnswering
# ServeCommand ships with the transformers CLI (transformers.commands.serving).
from transformers.commands.serving import ServeCommand
from tokenization_kobert import KoBertTokenizer
import os

if __name__ == '__main__':
    parser = ArgumentParser('Transformers CLI tool', usage='transformers-cli <command> [<args>]')
    commands_parser = parser.add_subparsers(help='transformers-cli command helpers')

    # Register commands
    ServeCommand.register_subcommand(commands_parser)

    # Let's go
    args = parser.parse_args()

    # load model and tokenizer
    if os.path.isdir(args.model):
        model = BertForQuestionAnswering.from_pretrained('models')
    if os.path.isdir(args.tokenizer):
        tokenizer = KoBertTokenizer.from_pretrained('models')

    if not hasattr(args, 'func'):
        parser.print_help()
        exit(1)

    # Run
    service = args.func(args)
    service.run()

def train():
    # load model and tokenizer
    # MODEL_NAME = "bert-base-multilingual-cased"
    MODEL_NAME = 'monologg/kobert'
    # tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
    print(tokenizer.tokenize("이순신은 조선 중기의 무신이다."))
    print(tokenizer.tokenize("아버지가방에들어가신다."))

    # load dataset
    # train_dataset = load_data("/opt/ml/input/data/train/train.tsv")
    # dev_dataset = load_data("./dataset/train/dev.tsv")
    # train_label = train_dataset['label'].values
    # dev_label = dev_dataset['label'].values
    train_dataset, dev_dataset = load_fold(6)
    train_label = train_dataset['label'].values
    dev_label = dev_dataset['label'].values

    # tokenizing dataset
    tokenized_train = tokenized_dataset(train_dataset, tokenizer)
    tokenized_dev = tokenized_dataset(dev_dataset, tokenizer)

    # make dataset for pytorch.
    RE_train_dataset = RE_Dataset(tokenized_train, train_label)
    RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label)
    # train_dataset, dev_dataset = torch.utils.data.random_split(RE_train_dataset, [7000, 2000])

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # setting model hyperparameter
    bert_config = BertConfig.from_pretrained(MODEL_NAME)
    bert_config.num_labels = 42
    model = BertForSequenceClassification.from_pretrained(MODEL_NAME, config=bert_config)
    model.to(device)

    # Beyond the options used here, many more are available; see
    # https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments for details.
    training_args = TrainingArguments(
        output_dir='./results',          # output directory
        save_total_limit=13,             # number of total save model.
        # load_best_model_at_end=True,
        save_steps=100,                  # model saving step.
        num_train_epochs=8,              # total number of training epochs
        learning_rate=5e-5,              # learning_rate
        per_device_train_batch_size=32,  # batch size per device during training
        per_device_eval_batch_size=64,   # batch size for evaluation
        warmup_steps=1000,               # number of warmup steps for learning rate scheduler
        weight_decay=0.01,               # strength of weight decay
        logging_dir='./logs',            # directory for storing logs
        logging_steps=100,               # log saving step.
        evaluation_strategy='steps',     # evaluation strategy to adopt during training
                                         # `no`: No evaluation during training.
                                         # `steps`: Evaluate every `eval_steps`.
                                         # `epoch`: Evaluate every end of epoch.
        eval_steps=100,                  # evaluation step.
    )
    trainer = Trainer(
        model=model,                      # the instantiated 🤗 Transformers model to be trained
        args=training_args,               # training arguments, defined above
        train_dataset=RE_train_dataset,   # training dataset
        eval_dataset=RE_dev_dataset,      # evaluation dataset
        compute_metrics=compute_metrics,  # define metrics function
    )

    # train model
    trainer.train()

def train(data_dir, model_dir, args): seed_everything(args.seed) s_dir = args.model + str(args.num_hidden_layers) + '-' + args.preprocess + '-epoch' + str(args.epochs) + \ '-' + args.criterion + '-' + args.scheduler + '-' + args.optimizer + '-' + args.dataset + '-' + args.tokenize if args.name: s_dir += '-' + args.name save_dir = increment_path(os.path.join(model_dir, s_dir)) use_cuda = torch.cuda.is_available() device = torch.device("cuda" if use_cuda else "cpu") print("This notebook use [%s]." % (device)) # load model and tokenizer MODEL_NAME = args.model if MODEL_NAME == "monologg/kobert": tokenizer = KoBertTokenizer.from_pretrained(MODEL_NAME) else: tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) # load dataset dataset = load_data("/opt/ml/input/data/train/train.tsv") labels = dataset['label'].values # setting model hyperparameter bert_config = BertConfig.from_pretrained(MODEL_NAME) bert_config.num_labels = args.num_labels bert_config.num_hidden_layers = args.num_hidden_layers model = BertForSequenceClassification.from_pretrained(MODEL_NAME, config=bert_config) model.dropout = nn.Dropout(p=args.drop) model.to(device) summary(model) # loss & optimizer if args.criterion == 'f1' or args.criterion == 'label_smoothing' or args.criterion == 'f1cross': criterion = create_criterion(args.criterion, classes=args.num_labels, smoothing=0.1) else: criterion = create_criterion(args.criterion) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] if args.optimizer == 'AdamP': optimizer = AdamP(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, betas=(0.9, 0.999), weight_decay=args.weight_decay) else: opt_module = getattr(import_module("torch.optim"), args.optimizer) # default: SGD optimizer = opt_module( optimizer_grouped_parameters, lr=args.lr, ) # logging logger = SummaryWriter(log_dir=save_dir) with open(os.path.join(save_dir, 'config.json'), 'w', encoding='utf-8') as f: json.dump(vars(args), f, ensure_ascii=False, indent=4) set_neptune(save_dir, args) # preprocess dataset if args.preprocess != 'no': pre_module = getattr(import_module("preprocess"), args.preprocess) dataset = pre_module(dataset, model, tokenizer) # train, val split kfold = StratifiedKFold(n_splits=5) for train_idx, val_idx in kfold.split(dataset, labels): train_dataset, val_dataset = dataset.loc[train_idx], dataset.loc[ val_idx] break tok_module = getattr(import_module("load_data"), args.tokenize) train_tokenized = tok_module(train_dataset, tokenizer, max_len=args.max_len) val_tokenized = tok_module(val_dataset, tokenizer, max_len=args.max_len) # make dataset for pytorch. 
RE_train_dataset = RE_Dataset( train_tokenized, train_dataset['label'].reset_index(drop='index')) RE_val_dataset = RE_Dataset(val_tokenized, val_dataset['label'].reset_index(drop='index')) train_loader = DataLoader( RE_train_dataset, batch_size=args.batch_size, num_workers=4, shuffle=True, pin_memory=use_cuda, ) val_loader = DataLoader( RE_val_dataset, batch_size=12, num_workers=1, shuffle=False, pin_memory=use_cuda, ) if args.scheduler == 'cosine': scheduler = CosineAnnealingLR(optimizer, T_max=2, eta_min=1e-6) elif args.scheduler == 'reduce': scheduler = ReduceLROnPlateau(optimizer, factor=0.5, patience=5) elif args.scheduler == 'step': scheduler = StepLR(optimizer, args.lr_decay_step, gamma=0.5) elif args.scheduler == 'cosine_warmup': t_total = len(train_loader) * args.epochs warmup_step = int(t_total * args.warmup_ratio) scheduler = get_cosine_schedule_with_warmup( optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total) else: scheduler = None print("Training Start!!!") best_val_acc = 0 best_val_loss = np.inf for epoch in range(args.epochs): # train loop model.train() train_loss, train_acc = AverageMeter(), AverageMeter() for idx, train_batch in enumerate(train_loader): optimizer.zero_grad() try: inputs, token_types, attention_mask, labels = train_batch.values( ) inputs = inputs.to(device) token_types = token_types.to(device) attention_mask = attention_mask.to(device) labels = labels.to(device) outs = model(input_ids=inputs, token_type_ids=token_types, attention_mask=attention_mask) except: inputs, attention_mask, labels = train_batch.values() inputs = inputs.to(device) attention_mask = attention_mask.to(device) labels = labels.to(device) outs = model(input_ids=inputs, attention_mask=attention_mask) preds = torch.argmax(outs.logits, dim=-1) loss = criterion(outs.logits, labels) acc = (preds == labels).sum().item() / len(labels) loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), 0.7) optimizer.step() if scheduler: scheduler.step() neptune.log_metric('learning_rate', get_lr(optimizer)) train_loss.update(loss.item(), len(labels)) train_acc.update(acc, len(labels)) if (idx + 1) % args.log_interval == 0: current_lr = get_lr(optimizer) print( f"Epoch[{epoch + 1}/{args.epochs}]({idx + 1}/{len(train_loader)}) || " f"training loss {train_loss.avg:.4f} || training accuracy {train_acc.avg:4.2%} || lr {current_lr}" ) logger.add_scalar("Train/loss", train_loss.avg, epoch * len(train_loader) + idx) logger.add_scalar("Train/accuracy", train_acc.avg, epoch * len(train_loader) + idx) neptune.log_metric(f'Train_loss', train_loss.avg) neptune.log_metric(f'Train_avg', train_acc.avg) neptune.log_metric('learning_rate', current_lr) val_loss, val_acc = AverageMeter(), AverageMeter() # val loop with torch.no_grad(): print("Calculating validation results...") model.eval() for val_batch in val_loader: try: inputs, token_types, attention_mask, labels = val_batch.values( ) inputs = inputs.to(device) token_types = token_types.to(device) attention_mask = attention_mask.to(device) labels = labels.to(device) outs = model(input_ids=inputs, token_type_ids=token_types, attention_mask=attention_mask) except: inputs, attention_mask, labels = val_batch.values() inputs = inputs.to(device) attention_mask = attention_mask.to(device) labels = labels.to(device) outs = model(input_ids=inputs, attention_mask=attention_mask) preds = torch.argmax(outs.logits, dim=-1) loss = criterion(outs.logits, labels) acc = (preds == labels).sum().item() / len(labels) val_loss.update(loss.item(), len(labels)) 
                val_acc.update(acc, len(labels))

        if val_acc.avg > best_val_acc:
            print(f"New best model for val acc : {val_acc.avg:4.2%}! Saving the best model...")
            torch.save(model.state_dict(), f"{save_dir}/best.pth")
            best_val_acc = val_acc.avg
        best_val_loss = min(best_val_loss, val_loss.avg)
        print(
            f"[Val] acc : {val_acc.avg:4.2%}, loss : {val_loss.avg:.4f} || "
            f"best acc : {best_val_acc:4.2%}, best loss : {best_val_loss:.4f}"
        )
        logger.add_scalar("Val/loss", val_loss.avg, epoch)
        logger.add_scalar("Val/accuracy", val_acc.avg, epoch)
        neptune.log_metric('Val_loss', val_loss.avg)
        neptune.log_metric('Val_acc', val_acc.avg)
        print()
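The train/val loops above rely on two small helpers, AverageMeter and get_lr, that are defined elsewhere in the repo; a minimal sketch of what they are assumed to look like:

class AverageMeter:
    """Tracks a running average of a metric, weighted by the batch size."""
    def __init__(self):
        self.sum, self.count, self.avg = 0.0, 0, 0.0

    def update(self, val, n=1):
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def get_lr(optimizer):
    """Return the learning rate of the first parameter group."""
    for param_group in optimizer.param_groups:
        return param_group['lr']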
def __init__(self, max_len: int):
    self.tokenizer = KoBertTokenizer.from_pretrained("monologg/kobert")
    self.max_len = max_len
    self.pad_token_id = 0
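A hypothetical companion method, sketching how max_len and pad_token_id would typically be used to produce fixed-length inputs (this is not the repo's code; the method name is an assumption):

def encode(self, sentence: str):
    # tokenize with special tokens, then truncate to max_len
    ids = self.tokenizer.encode(sentence)[:self.max_len]
    # 1 for real tokens, 0 for padding positions
    attention_mask = [1] * len(ids) + [0] * (self.max_len - len(ids))
    # right-pad the ids to max_len with the configured pad id
    ids = ids + [self.pad_token_id] * (self.max_len - len(ids))
    return ids, attention_mask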
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import TFBertModel, TFDistilBertModel

# load the PyTorch checkpoint into a TF model
model = TFBertModel.from_pretrained('monologg/kobert', from_pt=True)

from tokenization_kobert import KoBertTokenizer
tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')  # the same works for monologg/distilkobert

# 0. hyperparameters ------------------
maxlen = 80
batch_size_HP = 48
epochs_HP = 2
classes = 5

# 1. data preprocessing ----------------
X_data = []
Y_data = []
X_test = []
Y_test = []

train_dataset = pd.read_csv("dataset.csv", encoding='utf-8', sep='|')
test_dataset = pd.read_csv("testset.csv", encoding='utf-8', sep='|')
train_dataset = train_dataset.dropna()
test_dataset = test_dataset.dropna()

k = 0
X_data_ids = []
X_data_attn = []
X_data_seg = []
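The snippet stops right before the encoding loop; a sketch of how the three input lists might be filled (the 'document' column name is an assumption about dataset.csv):

for sentence in train_dataset['document']:
    ids = tokenizer.encode(sentence)[:maxlen]           # [CLS] ... [SEP], truncated
    attn = [1] * len(ids) + [0] * (maxlen - len(ids))   # 1 for real tokens, 0 for padding
    ids = ids + [tokenizer.pad_token_id] * (maxlen - len(ids))
    seg = [0] * maxlen                                  # single-segment input
    X_data_ids.append(ids)
    X_data_attn.append(attn)
    X_data_seg.append(seg)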
def train(cfg):
    results_path = os.path.join('./results', cfg['train_id_name'])
    if not os.path.exists(results_path):
        os.mkdir(results_path)

    os.environ['WANDB_PROJECT'] = 'KLUE_PROJECT'
    os.environ['WANDB_LOG_MODEL'] = 'true'

    MODEL_NAME = cfg['model_name']
    if MODEL_NAME == 'monologg/kobert':
        tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
    else:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # register NER tags as additional special tokens
    with open('../input/data/train/ner_tags.pickle', 'rb') as f:
        ner_tokens = pickle.load(f)
    special_tokens_dict = {'additional_special_tokens': ner_tokens}
    tokenizer.add_special_tokens(special_tokens_dict)

    train_dataset = load_data(cfg['train_data_path'])
    dev_dataset = load_data(cfg['valid_data_path'])
    train_label = train_dataset['label'].values
    dev_label = dev_dataset['label'].values

    tokenized_train = tokenized_dataset(train_dataset, tokenizer)
    tokenized_dev = tokenized_dataset(dev_dataset, tokenizer)
    RE_train_dataset = RE_Dataset(tokenized_train, train_label)
    RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    model_config = AutoConfig.from_pretrained(MODEL_NAME)
    model_config.num_labels = 42
    # model_config.vocab_size += len(ner_tokens)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=model_config)
    model.resize_token_embeddings(len(tokenizer))  # account for the added special tokens
    model.to(device)

    training_configs = cfg['train_args']
    training_args = TrainingArguments(
        output_dir=results_path,
        save_total_limit=training_configs['save_total_limit'],
        save_steps=training_configs['save_steps'],
        num_train_epochs=training_configs['num_train_epochs'],
        learning_rate=training_configs['learning_rate'],
        per_device_train_batch_size=training_configs['per_device_train_batch_size'],
        per_device_eval_batch_size=training_configs['per_device_eval_batch_size'],
        warmup_steps=training_configs['warmup_steps'],
        weight_decay=training_configs['weight_decay'],
        logging_dir=training_configs['logging_dir'],
        logging_steps=training_configs['logging_steps'],
        evaluation_strategy=training_configs['evaluation_strategy'],
        load_best_model_at_end=True,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=RE_train_dataset,
        eval_dataset=RE_dev_dataset,
        compute_metrics=compute_metrics,
        callbacks=[transformers.EarlyStoppingCallback(early_stopping_patience=cfg['early_stopping_patience'])],
    )
    # Trainer adds WandbCallback automatically when wandb is installed

    print('Start training.')
    trainer.train()
    print('Finished training.')
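A hypothetical example of the cfg dict this train() expects, with the keys inferred from their usage above (all values are illustrative):

cfg = {
    'train_id_name': 'kobert-re-baseline',
    'model_name': 'monologg/kobert',
    'train_data_path': '../input/data/train/train.tsv',
    'valid_data_path': '../input/data/train/valid.tsv',
    'early_stopping_patience': 3,
    'train_args': {
        'save_total_limit': 3, 'save_steps': 500,
        'num_train_epochs': 4, 'learning_rate': 5e-5,
        'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 16,
        'warmup_steps': 300, 'weight_decay': 0.01,
        'logging_dir': './logs', 'logging_steps': 100,
        'evaluation_strategy': 'steps',
    },
}
train(cfg)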
def _prepare_model(self, args, labels, num_labels, mode='train', model_dir=""):
    """
    Prepare the model and tokenizer for the trainer.

    :param args: parsed arguments.
    :param labels: label list for NER.
    :param num_labels: number of labels.
    :return: pretrained model and tokenizer.
    """
    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training downloads model & vocab
        torch.distributed.barrier()

    args.model_type = args.model_type.lower()
    config = AutoConfig.from_pretrained(
        args.config_name if args.config_name else model_dir,
        num_labels=num_labels,
        id2label={str(i): label for i, label in enumerate(labels)},
        label2id={label: i for i, label in enumerate(labels)},
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    tokenizer_args = {k: v for k, v in vars(args).items() if v is not None and k in TOKENIZER_ARGS}
    logger.info("Tokenizer arguments: %s", tokenizer_args)

    # force the KoBERT tokenizer instead of AutoTokenizer
    if mode == 'train':
        tokenizer = KoBertTokenizer.from_pretrained(
            args.tokenizer_name if args.tokenizer_name else model_dir,
            cache_dir=args.cache_dir if args.cache_dir else None,
            **tokenizer_args,
        )
    else:
        tokenizer = KoBertTokenizer.from_pretrained(model_dir, **tokenizer_args)
    # tokenizer = AutoTokenizer.from_pretrained(
    #     args.tokenizer_name if args.tokenizer_name else model_dir,
    #     cache_dir=args.cache_dir if args.cache_dir else None,
    #     **tokenizer_args,
    # )

    # BERT with a CRF head for token classification
    if mode == 'train':
        model = BertCRFForTokenClassification.from_pretrained(
            model_dir,
            from_tf=bool(".ckpt" in model_dir),
            config=config,
            cache_dir=args.cache_dir if args.cache_dir else None,
        )
    else:
        logger.info("Evaluate the following checkpoints: %s", [model_dir])
        model = BertCRFForTokenClassification.from_pretrained(model_dir)
    # model = AutoModelForTokenClassification.from_pretrained(
    #     args.model_name_or_path,
    #     from_tf=bool(".ckpt" in args.model_name_or_path),
    #     config=config,
    #     cache_dir=args.cache_dir if args.cache_dir else None,
    # )

    if args.local_rank == 0:
        # first process finished downloading; release the waiting processes
        torch.distributed.barrier()

    model.to(args.device)
    logger.info("Training/evaluation parameters %s", args)
    return model, tokenizer
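A hypothetical call site, sketching how the method would be used with a BIO label set (the trainer instance and label list are assumptions):

labels = ["O", "B-PER", "I-PER", "B-LOC", "I-LOC"]
model, tokenizer = trainer._prepare_model(
    args, labels, num_labels=len(labels),
    mode='train', model_dir='monologg/kobert')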
def train():
    seed_everything()
    transformers.logging.set_verbosity_info()

    parser = argparse.ArgumentParser()
    # parser.add_argument('--model_name', default='bert-base-multilingual-cased')
    parser.add_argument('--model_name', default='xlm-roberta-large')
    parser.add_argument('--version', default='v6', type=str)
    parser.add_argument('--valid_ratio', type=float, default=0.0)
    parser.add_argument('--epochs', type=int, default=5)
    parser.add_argument('--lr', type=float, default=2e-5)
    parser.add_argument('--adam_eps', type=float, default=1e-8)
    parser.add_argument('--weight_decay', type=float, default=0.001)
    parser.add_argument('--warmup_steps', type=int, default=500)
    parser.add_argument('--batch_size', type=int, default=8)
    parser.add_argument('--accumulation_steps', type=int, default=1)
    parser.add_argument('--max_grad_norm', type=float, default=1.0)
    parser.add_argument('--l2_reg_lambda', type=float, default=5e-3)
    parser.add_argument('--hidden_dropout_prob', type=float, default=0.2)
    parser.add_argument('--max_len', type=int, default=150)
    parser.add_argument('--scheduler_type', type=str, default='cosine')
    parser.add_argument('--data_type', type=str, default='original')
    args = parser.parse_args()

    if not os.path.exists(f'../results/{args.version}'):
        os.makedirs(f'../results/{args.version}', exist_ok=True)
    logging.basicConfig(level=logging.INFO)
    logging.basicConfig(filename=f'../results/{args.version}.log',
                        filemode='w',
                        format='%(asctime)s ==> %(message)s')
    wandb.init(config=args, project="[Pstage-NLP]", name=args.version, save_code=True)

    # load model and tokenizer
    MODEL_NAME = args.model_name
    if MODEL_NAME == "monologg/kobert":
        from tokenization_kobert import KoBertTokenizer
        tokenizer = KoBertTokenizer.from_pretrained(MODEL_NAME)
    else:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    if args.data_type == "original":
        train_data_path = "../data/train/train.tsv"
    elif args.data_type == "extra_v1":
        train_data_path = "../data/train/train_and_extra_v1.tsv"
    elif args.data_type == "extra_v2":
        train_data_path = "../data/train/train_and_extra_v2.tsv"
    elif args.data_type == "aug":
        train_data_path = "../data/train/aug_extra_train.tsv"

    # load dataset
    total_dataset = load_data2(train_data_path, "../data/label_type.pkl")
    if args.valid_ratio > 0.0:
        train_dataset, valid_dataset = train_test_split(
            total_dataset,
            test_size=args.valid_ratio,
            random_state=42,
            shuffle=True,
            stratify=total_dataset.label)
        valid_label = valid_dataset['label'].values
        valid_features = tokenized_dataset2(valid_dataset, tokenizer, max_length=args.max_len)
        all_input_ids = torch.tensor([f.input_ids for f in valid_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in valid_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in valid_features], dtype=torch.long)
        all_e1_mask = torch.tensor([f.e1_mask for f in valid_features], dtype=torch.long)  # entity 1 mask
        all_e2_mask = torch.tensor([f.e2_mask for f in valid_features], dtype=torch.long)  # entity 2 mask
        all_label_ids = torch.tensor([f.label_id for f in valid_features], dtype=torch.long)
        valid_ds = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                 all_label_ids, all_e1_mask, all_e2_mask)
        valid_dl = torch.utils.data.DataLoader(valid_ds, batch_size=args.batch_size, shuffle=False)
        evaluation_strategy = 'steps'
    else:
        train_dataset = total_dataset

    # make dataset for pytorch
    xlm = MODEL_NAME.startswith("xlm")
    train_features = tokenized_dataset2(train_dataset, tokenizer, xlm=xlm, max_length=args.max_len)
    all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
    all_e1_mask = torch.tensor([f.e1_mask for f in train_features], dtype=torch.long)  # entity 1 mask
    all_e2_mask = torch.tensor([f.e2_mask for f in train_features], dtype=torch.long)  # entity 2 mask
    all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
    train_ds = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                             all_label_ids, all_e1_mask, all_e2_mask)
    train_dl = torch.utils.data.DataLoader(train_ds, batch_size=args.batch_size,
                                           shuffle=True, num_workers=3)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # inverse-log-frequency class weights for the 42 relation classes
    val_counts = train_dataset.label.value_counts().sort_index().values
    cls_weight = 1 / np.log1p(val_counts)
    cls_weight = (cls_weight / cls_weight.sum()) * 42
    cls_weight = torch.tensor(cls_weight, dtype=torch.float32).to(device)

    # set model hyperparameters
    model_config = AutoConfig.from_pretrained(MODEL_NAME)
    model_config.num_labels = 42
    model_config.l2_reg_lambda = args.l2_reg_lambda
    model_config.latent_entity_typing = False
    if MODEL_NAME.startswith("bert"):
        model = BertForSequenceClassification(model_config, MODEL_NAME)
    elif MODEL_NAME.startswith("xlm"):
        model = XLMRobertaForSequenceClassification(model_config, MODEL_NAME)
    model.to(device)

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            'weight_decay': args.weight_decay
        },
        {
            'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]
    optimizer = optim.AdamW(optimizer_grouped_parameters, lr=args.lr, eps=args.adam_eps)

    num_training_steps = int(len(train_dl) // args.accumulation_steps * args.epochs)
    if args.scheduler_type == "cosine":
        scheduler = get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=num_training_steps)
    else:
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=num_training_steps)

    logging.info("***** Running training *****")
    logging.info("  Num examples = %d", len(train_dataset))
    logging.info("  Num Epochs = %d", args.epochs)
    logging.info("  Total optimization steps = %d", num_training_steps)
    wandb.watch(model)

    global_step = 0
    tr_loss = 0.0
    model.zero_grad()
    min_loss = float("inf")
    train_iterator = trange(int(args.epochs), desc="Epoch")
    for _ in train_iterator:
        corrects = 0
        total_sample = 0
        epoch_iterator = tqdm(train_dl, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2],
                'labels': batch[3],
                'e1_mask': batch[4],
                'e2_mask': batch[5],
                'cls_weight': cls_weight,
            }
            outputs = model(**inputs)  # model outputs are always tuples in pytorch-transformers (see docs)
            loss = outputs[0]
            pred = outputs[1]
            _, pred = torch.max(pred, dim=-1)
            corrects += np.sum((pred == batch[3]).detach().cpu().numpy())
            total_sample += batch[0].size(0)
            tr_acc = corrects / total_sample * 100

            if args.accumulation_steps > 1:
                loss = loss / args.accumulation_steps
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            tr_loss += loss.item()
            if (step + 1) % args.accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # update learning rate schedule
                model.zero_grad()
                global_step += 1
                wandb.log({"loss": tr_loss / global_step, "acc": tr_acc})

                if global_step % 100 == 0:
                    logging.info("global_step = %s, average loss = %s", global_step, tr_loss / global_step)
                if min_loss > tr_loss / global_step:
                    logging.info(f"Loss: {min_loss:.6f} -> {tr_loss/global_step:.6f}")
                    logging.info("save.")
                    min_loss = tr_loss / global_step
                    save_path = os.path.join(f"../results/{args.version}/checkpoint-best")
                    model.save_pretrained(save_path)
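The inverse-log-frequency weighting used above can be sanity-checked in isolation; a tiny standalone example with made-up class counts:

import numpy as np

counts = np.array([900, 90, 10])     # made-up class frequencies
w = 1 / np.log1p(counts)             # rarer classes get larger raw weights
w = w / w.sum() * len(counts)        # normalize so the weights average to 1
print(w)                             # approx. [0.56, 0.85, 1.59]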
tokened_str = mecab.morphs(line)
print(tokened_str)
# f.write(' / '.join(tokened_str) + "\n")
tokened_str = mecab.nouns(line)
print(tokened_str)
# f.write(' / '.join(tokened_str) + "\n")
tokened_str = mecab.pos(line)
print(tokened_str)
# for i in tokened_str:
#     f.write('/'.join(i) + " ")
# f.write("\n")

tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
tokened_str = tokenizer.tokenize("[CLS]" + line + "[SEP]")
print(tokened_str)
# f.write(' / '.join(tokened_str) + "\n")
print(tokenizer.convert_tokens_to_ids(tokened_str))
ids_str = "[" + ', '.join(str(e) for e in tokenizer.convert_tokens_to_ids(tokened_str)) + "]"
# f.write(''.join(ids_str))
# f.close()

import torch
from kobert_transformers import get_distilkobert_model, get_kobert_model

model = get_distilkobert_model()
# input_ids = torch.LongTensor([[31, 51, 99, 12, 20, 55, 87]])
input_ids = torch.LongTensor([tokenizer.convert_tokens_to_ids(tokened_str)])
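A minimal forward pass through DistilKoBERT with the ids built above; the attention mask is all ones since nothing is padded:

attention_mask = torch.ones_like(input_ids)
outputs = model(input_ids=input_ids, attention_mask=attention_mask)
last_hidden_state = outputs[0]   # shape: (batch, seq_len, hidden)
print(last_hidden_state.shape)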
def main():
    parser = argparse.ArgumentParser(
        description="Preprocess the data to avoid re-doing it several times (tokenization + token_to_ids)."
    )
    parser.add_argument('--file_path', type=str, default='data/dump.txt',
                        help='The path to the data.')
    parser.add_argument('--tokenizer_type', type=str, default='bert',
                        choices=['bert', 'roberta', 'gpt2', 'kobert'])
    parser.add_argument('--tokenizer_name', type=str, default='bert-base-uncased',
                        help='The tokenizer to use.')
    parser.add_argument('--dump_file', type=str, default='data/dump',
                        help='The dump file prefix.')
    args = parser.parse_args()

    logger.info(f'Loading tokenizer ({args.tokenizer_name})')
    if args.tokenizer_type == 'bert':
        tokenizer = BertTokenizer.from_pretrained(args.tokenizer_name)
        bos = tokenizer.special_tokens_map['cls_token']  # `[CLS]`
        sep = tokenizer.special_tokens_map['sep_token']  # `[SEP]`
    elif args.tokenizer_type == 'roberta':
        tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
        bos = tokenizer.special_tokens_map['cls_token']  # `<s>`
        sep = tokenizer.special_tokens_map['sep_token']  # `</s>`
    elif args.tokenizer_type == 'gpt2':
        tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer_name)
        bos = tokenizer.special_tokens_map['bos_token']  # `<|endoftext|>`
        sep = tokenizer.special_tokens_map['eos_token']  # `<|endoftext|>`
    elif args.tokenizer_type == 'kobert':
        tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
        bos = tokenizer.special_tokens_map['cls_token']
        sep = tokenizer.special_tokens_map['sep_token']

    logger.info(f'Loading text from {args.file_path}')
    with open(args.file_path, 'r', encoding='utf8') as fp:
        data = fp.readlines()

    logger.info('Start encoding')
    logger.info(f'{len(data)} examples to process.')

    rslt = []
    n_processed = 0
    interval = 10000
    start = time.time()
    for text in data:
        # wrap each line with the tokenizer's own special tokens
        text = f'{bos} {text.strip()} {sep}'
        token_ids = tokenizer.encode(text, add_special_tokens=False)
        rslt.append(token_ids)

        n_processed += 1
        if n_processed % interval == 0:
            end = time.time()
            logger.info(f'{n_processed} examples processed. - {(end - start) / interval:.2f}s/expl')
            start = time.time()

    logger.info('Finished binarization')
    logger.info(f'{len(data)} examples processed.')

    dp_file = f'{args.dump_file}.{args.tokenizer_name}.pickle'
    # store the token ids as uint16 to halve the pickle size (vocab must fit in 65536)
    rslt_ = [np.uint16(d) for d in rslt]
    random.shuffle(rslt_)
    logger.info(f'Dump to {dp_file}')
    with open(dp_file, 'wb') as handle:
        pickle.dump(rslt_, handle, protocol=pickle.HIGHEST_PROTOCOL)
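An example invocation for the KoBERT branch, assuming the script above is saved as binarize.py (the script name and paths are illustrative):

# python binarize.py --file_path data/korean_corpus.txt \
#                    --tokenizer_type kobert \
#                    --tokenizer_name monologg/kobert \
#                    --dump_file data/korean_dump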