Example #1
    def test_sentences(self, sentences):
        stopFlag = False
        tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
        # Switch the model to evaluation mode
        self.model.eval()
        # Convert the sentences into model inputs
        inputs, masks = self.convert_input_data(sentences)

        device = torch.device("cpu")
        # Move the tensors to the target device
        b_input_ids = inputs.to(device).long()
        b_input_mask = masks.to(device).long()
        # Disable gradient computation for inference
        with torch.no_grad():
            # Run the forward pass
            outputs = self.model(b_input_ids,
                                 token_type_ids=None,
                                 attention_mask=b_input_mask
                                 )
        # Argmax over the tag dimension: predicted label id per token
        label_ids = np.argmax(outputs[0].to('cpu').numpy(), axis=2)
        # Map ids back to (still BPE-split) tokens
        tokens = tokenizer.convert_ids_to_tokens(b_input_ids.to('cpu').numpy()[0])
        new_tokens, new_labels = [], []
        for token, label_idx in zip(tokens, label_ids[0]):
            if stopFlag:
                break
            if token == '[SEP]':  # only keep output up to the padding
                stopFlag = True
            new_labels.append(self.tag_dict_decode[label_idx])
            new_tokens.append(token)

        return new_labels, new_tokens
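# A minimal usage sketch (hypothetical `ner` instance holding the class
# above); KoBERT uses a SentencePiece vocab, so word-initial pieces start
# with '▁' and can be merged back into whole words:
labels, tokens = ner.test_sentences(['삼성전자는 서울에 있다.'])
words, word_labels = [], []
for token, label in zip(tokens, labels):
    if token in ('[CLS]', '[SEP]', '[PAD]'):
        continue  # skip special tokens
    if token.startswith('▁') or not words:
        words.append(token.lstrip('▁'))  # start a new word
        word_labels.append(label)        # keep the first piece's label
    else:
        words[-1] += token               # glue continuation pieces on
print(list(zip(words, word_labels)))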
Example #2
    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained('monologg/kobert')
        self.linear = nn.Linear(self.bert.config.hidden_size, 2)
        self.tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
        self.batch_size = 64
        self.lr = 3e-05
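    # A forward-pass sketch consistent with the fields above (an assumption;
    # the original class body is not shown): the pooled [CLS] output feeds
    # the 2-way linear head.
    def forward(self, input_ids, attention_mask=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        return self.linear(outputs[1])  # outputs[1] is the pooled output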
Example #3
def main(args):
  """
    주어진 dataset tsv 파일과 같은 형태일 경우 inference 가능한 코드입니다.
  """
  device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
  # load tokenizer
  #TOK_NAME = "bert-base-multilingual-cased"  
  #tokenizer = AutoTokenizer.from_pretrained(TOK_NAME)
  tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')

  # load my model
  MODEL_NAME = args.model_dir  # model dir.
  model = BertForSequenceClassification.from_pretrained(MODEL_NAME)
  model.to(device)

  # load test dataset
  test_dataset_dir = "/opt/ml/input/data/test/test.tsv"
  test_dataset, test_label = load_test_dataset(test_dataset_dir, tokenizer)
  test_dataset = RE_Dataset(test_dataset, test_label)

  # predict answer
  pred_answer = inference(model, test_dataset, device)
  # make a csv file with the predicted answers
  # Please keep the directory and column format below.

  os.makedirs('./prediction', exist_ok=True)  # ensure the output dir exists (assumes import os)
  output = pd.DataFrame(pred_answer, columns=['pred'])
  output.to_csv('./prediction/submission.csv', index=False)
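# Typical invocation (a sketch; the script name and checkpoint path are
# placeholders, not from the original):
# python inference.py --model_dir ./results/checkpoint-500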
Example #4
    def infer(test_data, **kwargs):
        '''
        :param test_data: list of noisy sentences
        :return: list of corrected sentences
        '''

        # Special-character cleanup is disabled for now.
        # test_data = preprocess_noisy_sentence_list(test_data)

        if args.tokenizer == 'char':
            tokenizer_ = tokenizer

        if args.tokenizer == 'kobert':
            tokenizer_ = KoBertTokenizer.from_pretrained('monologg/kobert')

        if True:  # beam-search branch; the `else` below is currently unreachable
            print("I'm beam search infer")
            prediction = correct_beam(model,
                                      tokenizer_,
                                      test_data,
                                      args,
                                      eos=eos,
                                      length_limit=0.15)
            # check = 0
            # for idx, pred in enumerate(prediction):
            #     if pred == "그렇게 하면 않지.":
            #         prediction[idx] = '그렇게 하면 안 되지.'
            #         check += 1
            #     elif pred == "이런 어의 없는 경우를 봤나.":
            #         check += 1
            #         prediction[idx] = '이런 어이없는 경우를 봤나.'
            #     elif pred == "차는 검정색이 이쁜 거 같애.":
            #         check += 1
            #         prediction[idx] = '차는 검은색이 이쁜 거 같아.'
            #
            #     if check == 3: break

            for i in range(len(test_data)):
                print("noisy: ", test_data[i])
                print("clean: ", prediction[i])
                print("======")

            return prediction
        else:
            prediction = correct(model,
                                 tokenizer_,
                                 test_data,
                                 args,
                                 eos=eos,
                                 length_limit=0.15)

            for i in range(len(test_data)):
                print("noisy: ", test_data[i])
                print("clean: ", prediction[i])
                print("======")

            return prediction
Example #5
    def __init__(self, model_type, max_len):
        self.max_len = max_len

        if model_type == "bert-base-multilingual-cased":
            self.tokenizer = BertTokenizer.from_pretrained(model_type,
                                                           do_lower_case=False)
        elif model_type == "monologg/kobert":
            self.tokenizer = KoBertTokenizer.from_pretrained(model_type)
        elif model_type == "etri/korbert":
            pass  # ETRI KorBERT branch not implemented; self.tokenizer is left unset
Example #6
    def infer(test_data, **kwargs):
        '''
        :param test_data: list of noisy sentences
        :return: list of corrected sentences
        '''

        # Special-character cleanup is disabled for now.
        # test_data = preprocess_noisy_sentence_list(test_data)

        if args.tokenizer == 'kobert':
            tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')

        return correct(model, tokenizer, test_data, args, eos=eos, length_limit=0.1)
Example #7
def main(args):
    """
    주어진 dataset tsv 파일과 같은 형태일 경우 inference 가능한 코드입니다.
    """
    seed_everything(args.seed)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # load tokenizer
    TOK_NAME = args.token
    if TOK_NAME == "monologg/kobert":
        tokenizer = KoBertTokenizer.from_pretrained(TOK_NAME)
    else:
        tokenizer = AutoTokenizer.from_pretrained(TOK_NAME)

    # load my model
    bert_config = BertConfig.from_pretrained(TOK_NAME)
    bert_config.num_labels = args.num_labels
    bert_config.num_hidden_layers = args.num_hidden_layers
    model = BertForSequenceClassification(bert_config)

    model_dir = os.path.join(args.model_dir, args.name)
    model_path = os.path.join(model_dir, 'best.pth')

    # load test dataset
    test_dataset_dir = "/opt/ml/input/data/test/test.tsv"
    test_dataset, test_label = load_test_dataset(test_dataset_dir, model,
                                                 tokenizer, args)
    test_dataset = RE_Dataset(test_dataset, test_label)

    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)

    # predict answer
    batch_size = args.batch_size
    print("Inference Start!!!")
    pred_answer = inference(model, test_dataset, device, batch_size)
    # make a csv file with the predicted answers
    # Please keep the directory and column format below.

    output = pd.DataFrame(pred_answer, columns=['pred'])
    save_dir = os.path.join(args.output_dir, args.name)
    os.makedirs(save_dir, exist_ok=True)
    output.to_csv(os.path.join(save_dir, f'{args.name}.csv'), index=False)
Example #8
def define_tokenizer(name):
    if name in [
            "bert-base-multilingual-cased",
            "sangrimlee/bert-base-multilingual-cased-korquad",
            "kykim/bert-kor-base"
    ]:
        return BertTokenizer.from_pretrained(name)
    elif name in [
            "monologg/koelectra-base-v3-discriminator",
            "kykim/electra-kor-base"
    ]:
        return ElectraTokenizer.from_pretrained(name)
    elif name in ["xlm-roberta-large"]:
        return XLMRobertaTokenizer.from_pretrained(name)
    elif name in ["monologg/kobert"]:
        return KoBertTokenizer.from_pretrained(name)
    elif name in ["kykim/funnel-kor-base"]:
        return FunnelTokenizer.from_pretrained(name)
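# Usage sketch for the dispatcher above. KoBERT gets the custom
# KoBertTokenizer because, per the KoBERT-Transformers README, its
# SentencePiece vocab is not loaded correctly by the generic Auto* classes:
tokenizer = define_tokenizer('monologg/kobert')
print(tokenizer.tokenize('한국어 토크나이저 테스트'))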
Example #9
def korean_bert_example():
	if False:
		pretrained_model_name = 'bert-base-multilingual-uncased'
		#pretrained_model_name = 'bert-base-multilingual-cased'  # Not correctly working.

		tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)
	else:
		# REF [site] >> https://github.com/monologg/KoBERT-Transformers
		from tokenization_kobert import KoBertTokenizer

		# REF [site] >> https://huggingface.co/monologg
		pretrained_model_name = 'monologg/kobert'
		#pretrained_model_name = 'monologg/distilkobert'

		tokenizer = KoBertTokenizer.from_pretrained(pretrained_model_name)

	tokens = tokenizer.tokenize('잘해놨습니다')
	token_ids = tokenizer.convert_tokens_to_ids(tokens)
	print('Tokens = {}.'.format(tokens))
	#print('Token IDs = {}.'.format(token_ids))

	model = BertForSequenceClassification.from_pretrained(pretrained_model_name)

	#--------------------
	input_ids = [
		tokenizer.encode('내 개는 무척 귀여워.', add_special_tokens=True),
		tokenizer.encode('내 고양이는 귀여워.', add_special_tokens=True),
		tokenizer.encode('내 돼지는 너무 작아요.', add_special_tokens=True),
	]
	max_input_len = len(max(input_ids, key=len))
	print('Max. input len = {}.'.format(max_input_len))
	def convert(x):
		y = [x[-1]] * max_input_len  # TODO [check] >> x[-1] is correct?
		y[:len(x)] = x
		return y
	input_ids = list(map(convert, input_ids))
	input_ids = torch.tensor(input_ids)

	model.eval()
	with torch.no_grad():
		model_outputs = model(input_ids)  # Batch size x #labels.

	print('Model output loss = {}.'.format(model_outputs.loss))  # None here: no labels were passed
	print('Model output logits = {}.'.format(model_outputs.logits))
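	# A safer padding variant (an assumption, not from the original code):
	# pad with the tokenizer's pad id and pass an attention mask so the
	# model ignores padding instead of attending to copies of the last id.
	input_ids = [
		tokenizer.encode('내 개는 무척 귀여워.', add_special_tokens=True),
		tokenizer.encode('내 고양이는 귀여워.', add_special_tokens=True),
	]
	max_len = max(map(len, input_ids))
	attention_mask = torch.tensor([[1] * len(x) + [0] * (max_len - len(x)) for x in input_ids])
	input_ids = torch.tensor([x + [tokenizer.pad_token_id] * (max_len - len(x)) for x in input_ids])
	with torch.no_grad():
		masked_outputs = model(input_ids, attention_mask=attention_mask)
	print('Logits with attention mask = {}.'.format(masked_outputs.logits))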
Example #10
    def convert_input_data(self, sentences):
        text_CLS = ["[CLS] " + str(txt) + " [SEP]" for txt in sentences]
        tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
        # Tokenize
        tokenized_texts = [tokenizer.tokenize(sent) for sent in text_CLS]
        MAX_LEN = 128  # maximum sequence length
        # Convert tokens to ids and pad to MAX_LEN
        input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
        input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
        # Build the attention masks
        attention_masks = []
        for seq in input_ids:
            seq_mask = [float(i > 0) for i in seq]
            attention_masks.append(seq_mask)
        # Convert to PyTorch tensors
        inputs = torch.tensor(input_ids)
        masks = torch.tensor(attention_masks)

        return inputs, masks
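    # Note on the mask above (an observation, not in the original): Keras
    # pad_sequences pads with 0, but in monologg/kobert's published vocab
    # id 0 is [UNK] and the real [PAD] id is 1, so `float(i > 0)` also masks
    # any genuine [UNK] tokens. Padding with value=tokenizer.pad_token_id
    # and masking `i != tokenizer.pad_token_id` avoids that.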
Example #11
def main(args):
    """
    주어진 dataset tsv 파일과 같은 형태일 경우 inference 가능한 코드입니다.
  """
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # load tokenizer
    #TOK_NAME = "bert-base-multilingual-cased"
    #tokenizer = AutoTokenizer.from_pretrained(TOK_NAME)
    tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')

    # load my model
    MODEL_NAME = args.model_dir  # model dir.
    model = BertForSequenceClassification.from_pretrained(MODEL_NAME)
    model.to(device)

    # load test dataset
    test_dataset_dir = "/opt/ml/input/data/test/test.tsv"
    test_dataset, test_label = load_test_dataset(test_dataset_dir, tokenizer)
    test_dataset = RE_Dataset(test_dataset, test_label)

    # predict answer
    pred_answer = inference(model, test_dataset, device)
    # make a csv file with the predicted answers
    # Please keep the directory and column format below.

    results = []
    for result in pred_answer:
        results.extend(result)
    results = np.array(results)
    print(results.shape)

    os.makedirs('./prediction', exist_ok=True)  # ensure the output dir exists (assumes import os)
    output = pd.DataFrame(results, columns=list(range(1, 43)))
    output.to_csv('./prediction/fold6.csv', index=False)
Example #12
import argparse

import torch
import torch.optim as optim
from transformers import AutoTokenizer

from tokenizer_methods import space_tokenizer, char_tokenizer, jamo_split, khaiii_tokenize, mecab, okt, komoran
from data_loader import dataloader
from model import RNN
from utils import train, evaluate, binary_accuracy, epoch_time
from tokenization_kobert import KoBertTokenizer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

parser = argparse.ArgumentParser(description="Hyperparameter settings")

# huggingface tokenizers
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased").tokenize
kobert_tokenizer = KoBertTokenizer.from_pretrained("monologg/kobert").tokenize
koelectra_tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-v2-discriminator").tokenize

# tokenizer list
tokenizers = [space_tokenizer, char_tokenizer, jamo_split, khaiii_tokenize, mecab, okt,
              komoran, bert_tokenizer, kobert_tokenizer, koelectra_tokenizer]
tokenizer_names = ['space', 'character', 'syllable', 'khaiii', 'mecab', 'okt', 'komoran', 'bert', 'kobert', 'koelectra']
# tokenizers = [kobert_tokenizer]
# tokenizer_names = ['kobert']
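# A quick side-by-side of the three subword tokenizers (a sketch, not part
# of the original script):
sample = '한국어 토크나이저를 비교해 봅니다.'
for name, tok in zip(['bert', 'kobert', 'koelectra'],
                     [bert_tokenizer, kobert_tokenizer, koelectra_tokenizer]):
    print(name, tok(sample))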

if __name__ == '__main__':
    
    # hyperparameters
    parser.add_argument('--n_epochs', required=False, default=10, type=int)
    parser.add_argument('--max_vocab_size', required=False, default=30000, type=int)
    parser.add_argument('--batch_size', required=False, default=64, type=int)
Example #13
    def __init__(self, model_type, max_len):
        self.tokenizer = KoBertTokenizer.from_pretrained(model_type,
                                                         do_lower_case=False)
        self.max_len = max_len
        self.ignore_index = torch.nn.CrossEntropyLoss().ignore_index
Example #14
def korean_table_question_answering_example():
	from transformers import pipeline
	from transformers import TapasConfig, TapasForQuestionAnswering, TapasTokenizer
	import pandas as pd
	# REF [site] >> https://github.com/monologg/KoBERT-Transformers
	from tokenization_kobert import KoBertTokenizer

	data_dict = {
		'배우': ['송광호', '최민식', '설경구'],
		'나이': ['54', '58', '53'],
		'출연작품수': ['38', '32', '42'],
		'생년월일': ['1967/02/25', '1962/05/30', '1967/05/14'],
	}
	data_df = pd.DataFrame.from_dict(data_dict)

	if False:
		# Show the data frame.
		from IPython.display import display, HTML
		display(data_df)
		#print(HTML(data_df.to_html()).data)

	query = '최민식씨의 나이는?'

	# REF [site] >> https://huggingface.co/monologg
	pretrained_model_name = 'monologg/kobert'
	#pretrained_model_name = 'monologg/distilkobert'

	if False:
		# Not working.

		table_pipeline = pipeline(
			'table-question-answering',
			model=pretrained_model_name,
			tokenizer=KoBertTokenizer.from_pretrained(pretrained_model_name)
		)
	elif False:
		# Not working.

		#config = TapasConfig(num_aggregation_labels=3, average_logits_per_cell=True, select_one_column=False)
		#model = TapasForQuestionAnswering.from_pretrained(pretrained_model_name, config=config)
		model = TapasForQuestionAnswering.from_pretrained(pretrained_model_name)

		table_pipeline = pipeline(
			'table-question-answering',
			model=model,
			tokenizer=KoBertTokenizer.from_pretrained(pretrained_model_name)
		)
	else:
		# Not correctly working.

		model = TapasForQuestionAnswering.from_pretrained(pretrained_model_name)

		table_pipeline = pipeline(
			'table-question-answering',
			model=model,
			tokenizer=TapasTokenizer.from_pretrained(pretrained_model_name)
		)

	answer = table_pipeline(data_dict, query)
	#answer = table_pipeline(data_df, query)
	print('Answer: {}.'.format(answer))
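	# Why the branches above fail (an explanation, not from the original):
	# monologg/kobert ships plain BERT weights and a SentencePiece vocab, so
	# TapasForQuestionAnswering has no TAPAS-specific weights to load and
	# TapasTokenizer has no WordPiece vocab file, leaving the pipeline
	# without a usable model/tokenizer pair.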
Example #15
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the training files for the CoNLL-2003 NER task.",
    )
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected in the list: " + ", ".join(MODEL_TYPES),
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS),
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written.",
    )

    # Other parameters
    parser.add_argument(
        "--labels",
        default="",
        type=str,
        help=
        "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.",
    )
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3",
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument("--do_train",
                        action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action="store_true",
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_predict",
                        action="store_true",
                        help="Whether to run predictions on the test set.")
    parser.add_argument(
        "--evaluate_during_training",
        action="store_true",
        help="Whether to run evaluation during training at each logging step.",
    )
    parser.add_argument(
        "--do_lower_case",
        action="store_true",
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--keep_accents",
                        action="store_const",
                        const=True,
                        help="Set this flag if model is trained with accents.")
    parser.add_argument(
        "--strip_accents",
        action="store_const",
        const=True,
        help="Set this flag if model is trained without accents.")
    parser.add_argument("--use_fast",
                        action="store_const",
                        const=True,
                        help="Set this flag to use fast tokenization.")
    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument("--logging_steps",
                        type=int,
                        default=500,
                        help="Log every X updates steps.")
    parser.add_argument("--save_steps",
                        type=int,
                        default=500,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument("--overwrite_output_dir",
                        action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        "--overwrite_cache",
        action="store_true",
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument(
        "--fp16",
        action="store_true",
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument("--server_ip",
                        type=str,
                        default="",
                        help="For distant debugging.")
    parser.add_argument("--server_port",
                        type=str,
                        default="",
                        help="For distant debugging.")

    parser.add_argument(
        "--log_filename",
        default='./test.log',
        type=str,
        required=False,
        help="Path to log file of the experiment.",
    )

    args = parser.parse_args()

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        filename=args.log_filename,
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Prepare CONLL-2003 task
    labels = get_labels(args.labels)
    num_labels = len(labels)
    # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
    pad_token_label_id = CrossEntropyLoss().ignore_index

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
    config = AutoConfig.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        id2label={str(i): label
                  for i, label in enumerate(labels)},
        label2id={label: i
                  for i, label in enumerate(labels)},
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    tokenizer_args = {
        k: v
        for k, v in vars(args).items() if v is not None and k in TOKENIZER_ARGS
    }
    logger.info("Tokenizer arguments: %s", tokenizer_args)

    tokenizer = KoBertTokenizer.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        cache_dir=args.cache_dir if args.cache_dir else None,
        **tokenizer_args,
    )
    # Force the KoBERT tokenizer (instead of the AutoTokenizer commented out below)

    # tokenizer = AutoTokenizer.from_pretrained(
    #     args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
    #     cache_dir=args.cache_dir if args.cache_dir else None,
    #     **tokenizer_args,
    # )

    # CRF Adding
    model = BertCRFForTokenClassification.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )

    # model = AutoModelForTokenClassification.from_pretrained(
    #     args.model_name_or_path,
    #     from_tf=bool(".ckpt" in args.model_name_or_path),
    #     config=config,
    #     cache_dir=args.cache_dir if args.cache_dir else None,
    # )

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                tokenizer,
                                                labels,
                                                pad_token_label_id,
                                                mode="train")
        global_step, tr_loss = train(args, train_dataset, model, tokenizer,
                                     labels, pad_token_label_id)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (model.module if hasattr(model, "module") else model
                         )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        # Force the KoBERT tokenizer

        tokenizer = KoBertTokenizer.from_pretrained(args.output_dir,
                                                    **tokenizer_args)
        # tokenizer = AutoTokenizer.from_pretrained(args.output_dir, **tokenizer_args)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split(
                "-")[-1] if len(checkpoints) > 1 else ""
            # CRF Addition
            model = BertCRFForTokenClassification.from_pretrained(checkpoint)
            # model = AutoModelForTokenClassification.from_pretrained(checkpoint)

            model.to(args.device)
            result, _ = evaluate(args,
                                 model,
                                 tokenizer,
                                 labels,
                                 pad_token_label_id,
                                 mode="dev",
                                 prefix=global_step)
            if global_step:
                result = {
                    "{}_{}".format(global_step, k): v
                    for k, v in result.items()
                }
            results.update(result)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            for key in sorted(results.keys()):
                writer.write("{} = {}\n".format(key, str(results[key])))

    if args.do_predict and args.local_rank in [-1, 0]:
        # Force the KoBERT tokenizer

        tokenizer = KoBertTokenizer.from_pretrained(args.output_dir,
                                                    **tokenizer_args)
        # tokenizer = AutoTokenizer.from_pretrained(args.output_dir, **tokenizer_args)

        # CRF Addition
        model = BertCRFForTokenClassification.from_pretrained(args.output_dir)
        #  model = AutoModelForTokenClassification.from_pretrained(args.output_dir)
        model.to(args.device)
        result, predictions = evaluate(args,
                                       model,
                                       tokenizer,
                                       labels,
                                       pad_token_label_id,
                                       mode="test")
        # Save results
        output_test_results_file = os.path.join(args.output_dir,
                                                "test_results.txt")
        with open(output_test_results_file, "w") as writer:
            for key in sorted(result.keys()):
                writer.write("{} = {}\n".format(key, str(result[key])))
        # Save predictions
        output_test_predictions_file = os.path.join(args.output_dir,
                                                    "test_predictions.txt")
        with open(output_test_predictions_file, "w") as writer:
            with open(os.path.join(args.data_dir, "test.txt"), "r") as f:
                example_id = 0
                for line in f:
                    if line.startswith(
                            "-DOCSTART-") or line == "" or line == "\n":
                        writer.write(line)
                        if not predictions[example_id]:
                            example_id += 1
                    elif predictions[example_id]:
                        output_line = line.split(
                        )[0] + " " + predictions[example_id].pop(0) + "\n"
                        writer.write(output_line)
                    else:
                        logger.warning(
                            "Maximum sequence length exceeded: No prediction for '%s'.",
                            line.split()[0])

    return results
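# Typical invocation (a sketch; the script name, paths, and model type are
# placeholders, not from the original):
# python run_ner_crf.py --data_dir ./data --model_type kobert \
#     --model_name_or_path monologg/kobert --output_dir ./out \
#     --do_train --do_eval --do_predict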
Example #16
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    try:
        num_labels = glue_tasks_num_labels[data_args.task_name]
        output_mode = glue_output_modes[data_args.task_name]
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    print(model_args.model_name_or_path)
    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = KoBertTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    train_dataset = (GlueDataset(
        data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir)
                     if training_args.do_train else None)
    eval_dataset = (GlueDataset(data_args,
                                tokenizer=tokenizer,
                                mode="dev",
                                cache_dir=model_args.cache_dir)
                    if training_args.do_eval else None)
    test_dataset = (GlueDataset(data_args,
                                tokenizer=tokenizer,
                                mode="test",
                                cache_dir=model_args.cache_dir)
                    if training_args.do_predict else None)

    def build_compute_metrics_fn(
            task_name: str) -> Callable[[EvalPrediction], Dict]:
        def compute_metrics_fn(p: EvalPrediction):
            if output_mode == "classification":
                preds = np.argmax(p.predictions, axis=1)
            elif output_mode == "regression":
                preds = np.squeeze(p.predictions)
            return glue_compute_metrics(task_name, preds, p.label_ids)

        return compute_metrics_fn

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=build_compute_metrics_fn(data_args.task_name),
    )

    # Training
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path if os.path.
                      isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        eval_datasets = [eval_dataset]
        if data_args.task_name == "mnli":
            mnli_mm_data_args = dataclasses.replace(data_args,
                                                    task_name="mnli-mm")
            eval_datasets.append(
                GlueDataset(mnli_mm_data_args,
                            tokenizer=tokenizer,
                            mode="dev",
                            cache_dir=model_args.cache_dir))

        for eval_dataset in eval_datasets:
            trainer.compute_metrics = build_compute_metrics_fn(
                eval_dataset.args.task_name)
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)

            output_eval_file = os.path.join(
                training_args.output_dir,
                f"eval_results_{eval_dataset.args.task_name}.txt")
            if trainer.is_world_master():
                with open(output_eval_file, "w") as writer:
                    logger.info("***** Eval results {} *****".format(
                        eval_dataset.args.task_name))
                    for key, value in eval_result.items():
                        logger.info("  %s = %s", key, value)
                        writer.write("%s = %s\n" % (key, value))

            eval_results.update(eval_result)

    if training_args.do_predict:
        logging.info("*** Test ***")
        test_datasets = [test_dataset]
        if data_args.task_name == "mnli":
            mnli_mm_data_args = dataclasses.replace(data_args,
                                                    task_name="mnli-mm")
            test_datasets.append(
                GlueDataset(mnli_mm_data_args,
                            tokenizer=tokenizer,
                            mode="test",
                            cache_dir=model_args.cache_dir))

        for test_dataset in test_datasets:
            predictions = trainer.predict(
                test_dataset=test_dataset).predictions
            y_probability = []
            if output_mode == "classification":
                y_probability = predictions

                predictions = np.argmax(predictions, axis=1)
            output_test_file = os.path.join(
                training_args.output_dir,
                f"test_results_{test_dataset.args.task_name}.txt")
            if trainer.is_world_master():
                with open(output_test_file, "w") as writer:
                    logger.info("***** Test results {} *****".format(
                        test_dataset.args.task_name))
                    writer.write(
                        "index\tprediction\tprobability0\tprobability1\n")
                    for index, item in enumerate(predictions):
                        # Compute the class probabilities once (explicit
                        # dim=0; the original relied on the deprecated
                        # implicit-dim softmax and recomputed it three times)
                        probs = F.softmax(
                            torch.from_numpy(y_probability[index]), dim=0)
                        print(probs)
                        if output_mode == "regression":
                            writer.write("%d\t%3.3f\n" % (index, item))
                        else:
                            item = test_dataset.get_labels()[item]
                            writer.write("%d\t%s\t%s\t%s\n" %
                                         (index, item, probs[0].item(),
                                          probs[1].item()))
    return eval_results
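# Typical invocation (a sketch; file and flag values are placeholders, not
# from the original):
# python run_glue_kobert.py train_config.json          # all args from one JSON file, or
# python run_glue_kobert.py --model_name_or_path monologg/kobert \
#     --task_name <glue-style task> --do_train --do_eval --output_dir ./out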
Example #17
def main():
    # from pathlib import Path
    # print("File      Path:", Path(__file__).absolute())
    # print("Directory Path:", Path().absolute())

    args = get_args()
    args.n_gpu = 1

    # noisy_sents_1 = read_strings(os.path.join(args.data_dir, "train_data", "train_data"))
    # clean_sents = read_strings(os.path.join(args.data_dir, "train_label"))
    # noisy_sents_2 = read_strings(os.path.join(args.data_dir, "train_data", "train_corpus"))
    #
    # noisy_sents = noisy_sents_1 + noisy_sents_2
    # noise_space_ratio = []
    #
    # for sentence in noisy_sents:
    #     noise_space_ratio.append(sentence.count(' ') / len(sentence))
    #
    # clean_space_ratio = []
    # for sentence in clean_sents:
    #     clean_space_ratio.append(sentence.count(' ') / len(sentence))
    #
    # print("noise_space_ratio: {}, clean_space_ratio: {}".format(sum(noise_space_ratio) / len(noise_space_ratio),
    #                                                             sum(clean_space_ratio) / len(clean_space_ratio)))

    # ##########
    # ##for local
    # args.num_workers=0
    # args.train_batch_size = 4
    # args.eval_batch_size = 4
    # args.eval_interval = 10
    # ##########

    set_seed(args)

    if args.tokenizer == 'char':
        tokenizer = CharTokenizer([])
    if args.tokenizer == 'kobert':
        print("koBERT tokenizer")
        tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
        args.vocab_size = tokenizer.vocab_size
        print(args.vocab_size)

    if args.load_vocab != "":
        tokenizer.load(args.load_vocab)
        args.vocab_size = len(tokenizer)

    logger.info(f"args: {json.dumps(args.__dict__, indent=2, sort_keys=True)}")

    os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = TransformerModel(
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        num_encoder_layers=args.num_encoder_layers,
        num_decoder_layers=args.num_decoder_layers,
        intermediate_size=args.intermediate_size,
        dropout=args.dropout,
    ).to(args.device)
    logger.info(
        f"# of model parameters: {sum(p.numel() for p in model.parameters()) * 1e-6:.2f}M"
    )

    eos_setting = args.eos_setting

    bind_nsml(model, tokenizer, args, eos=eos_setting)
    if args.pause:
        nsml.paused(scope=locals())

    if args.mode != 'test' and args.averaging != "":
        sess = 't0005/rush1-3/37'
        checkpoints = ["4500", "6500", "7500", "8000"]

        nsml.load(checkpoint=checkpoints[0], session=sess)
        args.vocab_size = len(tokenizer)
        print(args.vocab_size)

        model = TransformerModel(
            vocab_size=args.vocab_size,
            hidden_size=args.hidden_size,
            num_attention_heads=args.num_attention_heads,
            num_encoder_layers=args.num_encoder_layers,
            num_decoder_layers=args.num_decoder_layers,
            intermediate_size=args.intermediate_size,
            dropout=args.dropout,
        ).to(args.device)

        # Average the checkpoint weights, accumulating from zero; calling
        # named_parameters() afresh per checkpoint avoids reusing the
        # exhausted generator that made the original inner loop a no-op.
        new_dict_params = {
            name: torch.zeros_like(param.data)
            for name, param in model.named_parameters()
        }

        for checkpoint in checkpoints:
            bind_nsml(model, tokenizer, args, eos=eos_setting)
            nsml.load(checkpoint=checkpoint, session=sess)
            for name, param in model.named_parameters():
                new_dict_params[name] += param.data / len(checkpoints)

        model.load_state_dict(new_dict_params, strict=False)

        bind_nsml(model, tokenizer, args, eos=eos_setting)
        nsml.save('best')

    elif args.mode == 'eval':
        print("I'm in EVAL")

        checkpoint = 'best'
        sess = 't0005/rush1-3/507'
        nsml.load(checkpoint=checkpoint, session=sess)
        args.vocab_size = len(tokenizer)

        model = TransformerModel(
            vocab_size=args.vocab_size,
            hidden_size=args.hidden_size,
            num_attention_heads=args.num_attention_heads,
            num_encoder_layers=args.num_encoder_layers,
            num_decoder_layers=args.num_decoder_layers,
            intermediate_size=args.intermediate_size,
            dropout=args.dropout,
        ).to(args.device)

        bind_nsml(model, tokenizer, args, eos=eos_setting)
        nsml.load(checkpoint=checkpoint, session=sess)

        model.eval()
        #noisy_sents = open("./naver_data_clean.txt", "r", encoding='utf-8').read().splitlines()
        noisy_sents = read_strings(
            os.path.join(args.data_dir, "train_data", "train_corpus"))
        valid_noisy = noisy_sents[:1000]

        prediction = correct_beam(model,
                                  tokenizer,
                                  valid_noisy,
                                  args,
                                  eos=True,
                                  length_limit=0.15)

        for i, pred in enumerate(prediction[:1000]):
            print("noisy_input: {}, pred: {}".format(valid_noisy[i], pred))

        # bind_txt(prediction)
        # nsml.save('prediction')

        # with open('naver_data_clean_again.txt', 'w',encoding='utf-8') as f:
        #     for i, pred in enumerate(prediction):
        #         if i%500==0: print(i)
        #         f.write("%s\n" % pred)

    ## Only works with the char tokenizer.
    ## TODO: support the kobert tokenizer (different vocab size) if needed.
    elif args.mode != 'test' and args.resubmit != "":
        checkpoint = 'best'
        sess = 't0005/rush1-3/' + args.resubmit
        print(sess)

        model = None
        tokenizer = CharTokenizer([])
        bind_nsml(model, tokenizer, args, eos=eos_setting)
        nsml.load(checkpoint=checkpoint, session=sess)

        args.vocab_size = len(tokenizer)
        print(args.vocab_size)

        model = TransformerModel(
            vocab_size=args.vocab_size,
            hidden_size=args.hidden_size,
            num_attention_heads=args.num_attention_heads,
            num_encoder_layers=args.num_encoder_layers,
            num_decoder_layers=args.num_decoder_layers,
            intermediate_size=args.intermediate_size,
            dropout=args.dropout,
        ).to(args.device)

        bind_nsml(model, tokenizer, args, eos=eos_setting)
        nsml.load(checkpoint=checkpoint, session=sess)

        bind_nsml(model, tokenizer, args, eos=eos_setting)
        ########## testing loaded model & tokenizer ###############

        # model.eval()
        # noisy_sents = read_strings(os.path.join(args.data_dir, "train_data", "train_data"))
        # valid_noisy = noisy_sents[-10:]
        #
        # prediction = correct(model, tokenizer, valid_noisy, args, eos=True, length_limit=0.1)
        #
        # for pred in prediction:
        #     print(pred)

        ##################

        nsml.save("best")

    else:
        #train_data, valid_data = None, None
        if args.mode == "train" or args.mode == "pretrain" or args.mode == "semi-train":
            if args.mode == "train":
                # noisy_sents = open("./noisy_sejong_500k.txt", "r", encoding='utf-8').read().splitlines()[:20000]
                # clean_sents = open("./clean_sejong_500k.txt", "r", encoding='utf-8').read().splitlines()[:20000]
                # sents_annotation = ['None'] * len(noisy_sents)
                noisy_sents = read_strings(
                    os.path.join(args.data_dir, "train_data", "train_data"))
                sents_annotation = read_strings(
                    os.path.join(args.data_dir, "train_data",
                                 "train_annotation"))
                clean_sents = read_strings(
                    os.path.join(args.data_dir, "train_label"))

            if args.mode == "semi-train":
                noisy_sents = read_strings(
                    os.path.join(args.data_dir, "train_data", "train_data"))
                sents_annotation = read_strings(
                    os.path.join(args.data_dir, "train_data",
                                 "train_annotation"))
                clean_sents = read_strings(
                    os.path.join(args.data_dir, "train_label"))

                checkpoint = 'generated_data'
                sess = 't0005/rush1-1/' + str(args.semi_dataset)
                # five copy
                #sess = 't0005/rush1-1/209'
                # one copy
                #sess = 't0005/rush1-1/224'
                semi_noisy_sents, semi_clean_sents = load_generated_data(
                    checkpoint=checkpoint, session=sess)
                semi_sents_annotation = ['None'] * len(semi_noisy_sents)

            if args.mode == "pretrain":
                print("PRETRAIN MODE ON!!")
                noisy_sents = read_strings(
                    os.path.join('sejong_corpus', args.noisy_file))
                clean_sents = read_strings(
                    os.path.join('sejong_corpus', args.clean_file))
                # checkpoint = 'generated_data'
                # sess = 't0005/rush1-1/113'
                # noisy_sents, clean_sents = load_generated_data(checkpoint=checkpoint, session=sess)
                sents_annotation = ['None'] * len(noisy_sents)

            error_type_counter = Counter()

            for annotation in sents_annotation:
                error_type_counter += Counter(annotation.split(','))

            print(error_type_counter)

            # noise-cleaning version:
            # pairs = [{"noisy": preprocess_sentence(noisy), "clean": clean} for noisy, clean in zip(noisy_sents, clean_sents)]
            # original version:

            if args.mode == "semi-train":
                pairs = [{
                    "noisy": noisy,
                    "clean": clean,
                    "annotation": annot
                }
                         for noisy, clean, annot in zip(
                             noisy_sents, clean_sents, sents_annotation)]
                semi_pairs = [{
                    "noisy": noisy,
                    "clean": clean,
                    "annotation": annot
                } for noisy, clean, annot in zip(
                    semi_noisy_sents, semi_clean_sents, semi_sents_annotation)]

                train_data = pairs[:-args.num_val_data] + semi_pairs
                valid_data = pairs[-args.num_val_data:]
                logger.info(f"# of train data: {len(train_data)}")
                logger.info(f"# of valid data: {len(valid_data)}")

                train_sents = [x['noisy'] for x in train_data
                               ] + [x['clean'] for x in train_data]
                tokenizer = CharTokenizer.from_strings(train_sents,
                                                       args.vocab_size)
                bind_nsml(model, tokenizer, args, eos=eos_setting)

            else:
                pairs = [{
                    "noisy": noisy,
                    "clean": clean,
                    "annotation": annot
                }
                         for noisy, clean, annot in zip(
                             noisy_sents, clean_sents, sents_annotation)]

                train_data, valid_data = train_test_split(
                    pairs, test_size=args.val_ratio,
                    random_state=args.seed)  # test: about 1000
                logger.info(f"# of train data: {len(train_data)}")
                logger.info(f"# of valid data: {len(valid_data)}")

                # print("validation: ", valid_data)

                train_sents = [x['noisy'] for x in train_data
                               ] + [x['clean'] for x in train_data]
                # train_sents = [x['clean'] for x in train_data]

                if args.load_model != "" and args.mode == "train":  # Load pretrained model
                    print("load pretrained model")
                    model.load_state_dict(
                        torch.load(args.load_model, map_location=args.device))

                    if args.freeze:
                        model.token_embeddings.weight.requires_grad = False
                        model.decoder_embeddings.weight.requires_grad = False

                if args.tokenizer == 'char' and args.load_vocab == "":
                    tokenizer = CharTokenizer.from_strings(
                        train_sents, args.vocab_size)
                    print(
                        f'tokenizer loaded from strings. len={len(tokenizer)}.'
                    )

                bind_nsml(model, tokenizer, args, eos=eos_setting)

                if args.tokenizer == 'char' and tokenizer is not None:
                    tokenizer.save('vocab.txt')

        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model, dim=1)

        if args.mode == "train" or args.mode == "pretrain" or args.mode == 'semi-train':
            train(model,
                  tokenizer,
                  train_data,
                  valid_data,
                  args,
                  eos=eos_setting)
Example #18
    def __init__(self, args):
        self.args = args
        self.tokenizer = KoBertTokenizer.from_pretrained("monologg/kobert")
        self.sep_vid = self.tokenizer.token2idx[self.tokenizer.sep_token]
        self.cls_vid = self.tokenizer.token2idx[self.tokenizer.cls_token]
        self.pad_vid = self.tokenizer.token2idx[self.tokenizer.pad_token]
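        # The same ids are also exposed by the standard tokenizer properties
        # (an equivalent alternative, not from the original):
        # self.sep_vid = self.tokenizer.sep_token_id
        # self.cls_vid = self.tokenizer.cls_token_id
        # self.pad_vid = self.tokenizer.pad_token_id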
Example #19
def main():
    args = get_args()
    logger.info(f"args: {json.dumps(args.__dict__, indent=2, sort_keys=True)}")

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()
    set_seed(args)

    if args.tokenizer == 'char':
        tokenizer = CharTokenizer([])
    if args.tokenizer == 'kobert':
        print("koBERT tokenizer")
        tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
        args.vocab_size = tokenizer.vocab_size
        print(args.vocab_size)

    model = TransformerModel(
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        num_encoder_layers=args.num_encoder_layers,
        num_decoder_layers=args.num_decoder_layers,
        intermediate_size=args.intermediate_size,
        dropout=args.dropout,
    ).to(args.device)
    logger.info(f"# of model parameters: {sum(p.numel() for p in model.parameters()) * 1e-6:.2f}M")

    eos_setting = args.eos_setting

    bind_nsml(model, tokenizer, args, eos=eos_setting)
    if args.pause:
        nsml.paused(scope=locals())

    #train_data, valid_data = None, None
    if args.mode == "train" or args.mode == "pretrain" or args.mode == "semi-train":
        if args.mode == "train":
            noisy_sents = read_strings(os.path.join(args.data_dir, "train_data", "train_data"))
            sents_annotation = read_strings(os.path.join(args.data_dir, "train_data", "train_annotation"))
            clean_sents = read_strings(os.path.join(args.data_dir, "train_label"))

        if args.mode == "semi-train":
            noisy_sents = read_strings(os.path.join(args.data_dir, "train_data", "train_data"))
            sents_annotation = read_strings(os.path.join(args.data_dir, "train_data", "train_annotation"))
            clean_sents = read_strings(os.path.join(args.data_dir, "train_label"))

            checkpoint = 'generated_data'
            sess = 't0005/rush1-1/'+str(args.semi_dataset)
            # five copies
            #sess = 't0005/rush1-1/209'
            # one copy
            #sess = 't0005/rush1-1/224'
            semi_noisy_sents, semi_clean_sents = load_generated_data(checkpoint=checkpoint, session=sess)
            semi_sents_annotation = ['None'] * len(semi_noisy_sents)

        if args.mode == "pretrain":
            print("PRETRAIN MODE ON!!")
            checkpoint = 'generated_data'
            sess = 't0005/rush1-1/113'
            noisy_sents, clean_sents = load_generated_data(checkpoint=checkpoint, session=sess)
            sents_annotation = ['None']*len(noisy_sents)

        error_type_counter = Counter()

        for annotation in sents_annotation:
            error_type_counter += Counter(annotation.split(','))

        print(error_type_counter)

        # noise-cleaned version:
        # pairs = [{"noisy": preprocess_sentence(noisy), "clean": clean} for noisy, clean in zip(noisy_sents, clean_sents)]
        # original version:

        if args.mode == "semi-train":
            pairs = [{"noisy": noisy, "clean": clean, "annotation": annot} for noisy, clean, annot in
                     zip(noisy_sents, clean_sents, sents_annotation)]
            semi_pairs = [{"noisy": noisy, "clean": clean, "annotation": annot} for noisy, clean, annot in
                     zip(semi_noisy_sents, semi_clean_sents, semi_sents_annotation)]

            train_data = pairs[:-args.num_val_data]+semi_pairs
            valid_data = pairs[-args.num_val_data:]
            logger.info(f"# of train data: {len(train_data)}")
            logger.info(f"# of valid data: {len(valid_data)}")

            train_sents = [x['noisy'] for x in train_data] + [x['clean'] for x in train_data]
            tokenizer = CharTokenizer.from_strings(train_sents, args.vocab_size)
            bind_nsml(model, tokenizer, args, eos=eos_setting)

        else:
            pairs = [{"noisy": noisy, "clean": clean, "annotation": annot} for noisy, clean, annot in zip(noisy_sents, clean_sents, sents_annotation)]

            train_data, valid_data = pairs[:-args.num_val_data], pairs[-args.num_val_data:]
            logger.info(f"# of train data: {len(train_data)}")
            logger.info(f"# of valid data: {len(valid_data)}")

            train_sents = [x['noisy'] for x in train_data] + [x['clean'] for x in train_data]
            #tokenizer = CharTokenizer.from_strings(train_sents, args.vocab_size)
            bind_nsml(model, tokenizer, args, eos=eos_setting)

        # load the pretrained model
        nsml.load(checkpoint='best', session='t0005/rush1-2/79')
        #print(tokenizer.vocab)

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model, dim=1)

    if args.mode == "train" or args.mode == "pretrain" or args.mode == 'semi-train':
        train(model, tokenizer, train_data, valid_data, args, eos=eos_setting)
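read_strings is not defined in this snippet; presumably it is a thin file reader along these lines (an assumption, not the project's actual code):

def read_strings(path):
    # Assumed helper: one example per line, trailing newline stripped.
    with open(path, 'r', encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f]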
Exemple #20
0
def main():
    args = get_args()
    logger.info(f"args: {json.dumps(args.__dict__, indent=2, sort_keys=True)}")

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()
    set_seed(args)

    if args.tokenizer == 'char':
        tokenizer = CharTokenizer([])
    if args.tokenizer == 'kobert':
        print("koBERT tokenizer")
        tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
        args.vocab_size = tokenizer.vocab_size
        print(args.vocab_size)

    model = TransformerModel(
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        num_encoder_layers=args.num_encoder_layers,
        num_decoder_layers=args.num_decoder_layers,
        intermediate_size=args.intermediate_size,
        dropout=args.dropout,
    ).to(args.device)
    logger.info(
        f"# of model parameters: {sum(p.numel() for p in model.parameters()) * 1e-6:.2f}M"
    )

    eos_setting = args.eos_setting

    bind_nsml(model, tokenizer, args, eos=eos_setting)
    if args.pause:
        nsml.paused(scope=locals())

    #train_data, valid_data = None, None
    if args.mode == "train":
        noisy_sents_labeled = read_strings(
            os.path.join(args.data_dir, "train_data", "train_data"))
        sents_annotation_labeled = read_strings(
            os.path.join(args.data_dir, "train_data", "train_annotation"))
        clean_sents_labeled = read_strings(
            os.path.join(args.data_dir, "train_label"))
        noisy_sents = read_strings(
            os.path.join(args.data_dir, "train_data", "train_corpus"))

        pairs = noisy_sents
        pairs_labeled = clean_sents_labeled

        train_data = pairs + noisy_sents_labeled[:-args.num_val_data] \
                     + pairs_labeled[:-args.num_val_data]
        valid_data = pairs_labeled[-args.num_val_data:]
        logger.info(f"# of train data: {len(train_data)}")
        logger.info(f"# of valid data: {len(valid_data)}")

        train_sents = [x for x in train_data]

        if args.tokenizer == 'char':
            tokenizer = CharTokenizer.from_strings(train_sents,
                                                   args.vocab_size)
        print("===vocab size: ", len(tokenizer))
        args.vocab_size = len(tokenizer)

        model = TransformerModel(
            vocab_size=args.vocab_size,
            hidden_size=args.hidden_size,
            num_attention_heads=args.num_attention_heads,
            num_encoder_layers=args.num_encoder_layers,
            num_decoder_layers=args.num_decoder_layers,
            intermediate_size=args.intermediate_size,
            dropout=args.dropout,
        ).to(args.device)

        bind_nsml(model, tokenizer, args, eos=eos_setting)

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model, dim=1)

    if args.mode == "train":
        train(model, tokenizer, train_data, valid_data, args, eos=eos_setting)
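CharTokenizer is also project code that is not shown. A minimal character-level sketch of the from_strings constructor used above (illustrative only; the real class will differ):

from collections import Counter

class CharTokenizer:
    def __init__(self, vocab):
        self.vocab = vocab
        self.token2idx = {c: i for i, c in enumerate(vocab)}

    def __len__(self):
        return len(self.vocab)

    @classmethod
    def from_strings(cls, strings, vocab_size):
        # Keep the most frequent characters, reserving slots for special tokens.
        specials = ['[PAD]', '[UNK]', '[CLS]', '[SEP]']
        counter = Counter(c for s in strings for c in s)
        chars = [c for c, _ in counter.most_common(vocab_size - len(specials))]
        return cls(specials + chars)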
Exemple #21
0
import os
from argparse import ArgumentParser

from transformers import BertForQuestionAnswering
from transformers.commands.serving import ServeCommand
from tokenization_kobert import KoBertTokenizer

if __name__ == '__main__':
    parser = ArgumentParser('Transformers CLI tool',
                            usage='transformers-cli <command> [<args>]')
    commands_parser = parser.add_subparsers(
        help='transformers-cli command helpers')

    # Register commands
    ServeCommand.register_subcommand(commands_parser)

    # Let's go
    args = parser.parse_args()

    # load model and tokenizer
    if os.path.isdir(args.model):
        model = BertForQuestionAnswering.from_pretrained(args.model)
    if os.path.isdir(args.tokenizer):
        tokenizer = KoBertTokenizer.from_pretrained(args.tokenizer)

    if not hasattr(args, 'func'):
        parser.print_help()
        exit(1)

    # Run
    service = args.func(args)
    service.run()
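Once the model and tokenizer are loaded, extractive QA inference works roughly as follows (a sketch, not part of the CLI above; with a recent transformers version the output exposes start_logits/end_logits):

import torch

question, context = "이순신은 누구인가?", "이순신은 조선 중기의 무신이다."
enc = tokenizer.encode_plus(question, context, return_tensors='pt')
with torch.no_grad():
    outputs = model(**enc)
start = torch.argmax(outputs.start_logits)
end = torch.argmax(outputs.end_logits) + 1
answer_tokens = tokenizer.convert_ids_to_tokens(enc['input_ids'][0][start:end])
print(tokenizer.convert_tokens_to_string(answer_tokens))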
Exemple #22
0
def train():
    # load model and tokenizer
    #MODEL_NAME = "bert-base-multilingual-cased"
    MODEL_NAME = 'monologg/kobert'
    #tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
    print(tokenizer.tokenize("이순신은 조선 중기의 무신이다."))
    print(tokenizer.tokenize("아버지가방에들어가신다."))

    # load dataset
    #train_dataset = load_data("/opt/ml/input/data/train/train.tsv")
    #dev_dataset = load_data("./dataset/train/dev.tsv")
    #train_label = train_dataset['label'].values
    #dev_label = dev_dataset['label'].values
    train_dataset, dev_dataset = load_fold(6)
    train_label = train_dataset['label'].values
    dev_label = dev_dataset['label'].values

    # tokenizing dataset
    tokenized_train = tokenized_dataset(train_dataset, tokenizer)
    tokenized_dev = tokenized_dataset(dev_dataset, tokenizer)

    # make dataset for pytorch.
    RE_train_dataset = RE_Dataset(tokenized_train, train_label)
    RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label)
    #train_dataset, dev_dataset = torch.utils.data.random_split(RE_train_dataset, [7000, 2000])

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # setting model hyperparameter
    bert_config = BertConfig.from_pretrained(MODEL_NAME)
    bert_config.num_labels = 42
    model = BertForSequenceClassification.from_pretrained(MODEL_NAME,
                                                          config=bert_config)
    #model.parameters
    model.to(device)

    # Besides the options used here, many other options are available.
    # See https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments for details.
    training_args = TrainingArguments(
        output_dir='./results',  # output directory
        save_total_limit=13,  # maximum number of checkpoints to keep
        #load_best_model_at_end=True,
        save_steps=100,  # save a checkpoint every 100 steps
        num_train_epochs=8,  # total number of training epochs
        learning_rate=5e-5,  # learning_rate
        per_device_train_batch_size=32,  # batch size per device during training
        per_device_eval_batch_size=64,  # batch size for evaluation
        warmup_steps=1000,  # number of warmup steps for learning rate scheduler
        weight_decay=0.01,  # strength of weight decay
        logging_dir='./logs',  # directory for storing logs
        logging_steps=100,  # log saving step.
        evaluation_strategy='steps',  # evaluation strategy to adopt during training
        # `no`: No evaluation during training.
        # `steps`: Evaluate every `eval_steps`.
        # `epoch`: Evaluate every end of epoch.
        eval_steps=100,  # evaluation step.
    )
    trainer = Trainer(
        model=model,  # the instantiated 🤗 Transformers model to be trained
        args=training_args,  # training arguments, defined above
        train_dataset=RE_train_dataset,  # training dataset
        eval_dataset=RE_dev_dataset,  # evaluation dataset
        compute_metrics=compute_metrics,  # define metrics function
    )

    # train model
    trainer.train()
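compute_metrics is referenced but not defined here; for this 42-way classification task it could be as simple as the following (an assumption about the project's metric function):

from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    # pred.label_ids holds the gold labels, pred.predictions the raw logits.
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(labels, preds)}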
Exemple #23
0
def train(data_dir, model_dir, args):
    seed_everything(args.seed)

    s_dir = args.model + str(args.num_hidden_layers) + '-' + args.preprocess + '-epoch' + str(args.epochs) + \
            '-' + args.criterion + '-' + args.scheduler + '-' + args.optimizer + '-' + args.dataset + '-' + args.tokenize

    if args.name:
        s_dir += '-' + args.name
    save_dir = increment_path(os.path.join(model_dir, s_dir))

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    print("This notebook use [%s]." % (device))

    # load model and tokenizer
    MODEL_NAME = args.model
    if MODEL_NAME == "monologg/kobert":
        tokenizer = KoBertTokenizer.from_pretrained(MODEL_NAME)
    else:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # load dataset
    dataset = load_data("/opt/ml/input/data/train/train.tsv")
    labels = dataset['label'].values

    # setting model hyperparameter
    bert_config = BertConfig.from_pretrained(MODEL_NAME)
    bert_config.num_labels = args.num_labels
    bert_config.num_hidden_layers = args.num_hidden_layers
    model = BertForSequenceClassification.from_pretrained(MODEL_NAME,
                                                          config=bert_config)
    model.dropout = nn.Dropout(p=args.drop)
    model.to(device)

    summary(model)

    # loss & optimizer
    if args.criterion in ('f1', 'label_smoothing', 'f1cross'):
        criterion = create_criterion(args.criterion,
                                     classes=args.num_labels,
                                     smoothing=0.1)
    else:
        criterion = create_criterion(args.criterion)

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01,
        },
        {
            'params': [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0,
        },
    ]

    if args.optimizer == 'AdamP':
        optimizer = AdamP(filter(lambda p: p.requires_grad,
                                 model.parameters()),
                          lr=args.lr,
                          betas=(0.9, 0.999),
                          weight_decay=args.weight_decay)
    else:
        opt_module = getattr(import_module("torch.optim"),
                             args.optimizer)  # default: SGD
        optimizer = opt_module(
            optimizer_grouped_parameters,
            lr=args.lr,
        )

    # logging
    logger = SummaryWriter(log_dir=save_dir)
    with open(os.path.join(save_dir, 'config.json'), 'w',
              encoding='utf-8') as f:
        json.dump(vars(args), f, ensure_ascii=False, indent=4)

    set_neptune(save_dir, args)

    # preprocess dataset
    if args.preprocess != 'no':
        pre_module = getattr(import_module("preprocess"), args.preprocess)
        dataset = pre_module(dataset, model, tokenizer)

    # train, val split
    kfold = StratifiedKFold(n_splits=5)

    for train_idx, val_idx in kfold.split(dataset, labels):
        train_dataset, val_dataset = dataset.loc[train_idx], dataset.loc[
            val_idx]
        break

    tok_module = getattr(import_module("load_data"), args.tokenize)

    train_tokenized = tok_module(train_dataset,
                                 tokenizer,
                                 max_len=args.max_len)
    val_tokenized = tok_module(val_dataset, tokenizer, max_len=args.max_len)

    # make dataset for pytorch.
    RE_train_dataset = RE_Dataset(
        train_tokenized, train_dataset['label'].reset_index(drop=True))
    RE_val_dataset = RE_Dataset(val_tokenized,
                                val_dataset['label'].reset_index(drop=True))

    train_loader = DataLoader(
        RE_train_dataset,
        batch_size=args.batch_size,
        num_workers=4,
        shuffle=True,
        pin_memory=use_cuda,
    )

    val_loader = DataLoader(
        RE_val_dataset,
        batch_size=12,
        num_workers=1,
        shuffle=False,
        pin_memory=use_cuda,
    )

    if args.scheduler == 'cosine':
        scheduler = CosineAnnealingLR(optimizer, T_max=2, eta_min=1e-6)
    elif args.scheduler == 'reduce':
        scheduler = ReduceLROnPlateau(optimizer, factor=0.5, patience=5)
    elif args.scheduler == 'step':
        scheduler = StepLR(optimizer, args.lr_decay_step, gamma=0.5)
    elif args.scheduler == 'cosine_warmup':
        t_total = len(train_loader) * args.epochs
        warmup_step = int(t_total * args.warmup_ratio)
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_step,
            num_training_steps=t_total)
    else:
        scheduler = None

    print("Training Start!!!")

    best_val_acc = 0
    best_val_loss = np.inf

    for epoch in range(args.epochs):
        # train loop
        model.train()

        train_loss, train_acc = AverageMeter(), AverageMeter()

        for idx, train_batch in enumerate(train_loader):
            optimizer.zero_grad()

            try:
                inputs, token_types, attention_mask, labels = train_batch.values()
                inputs = inputs.to(device)
                token_types = token_types.to(device)
                attention_mask = attention_mask.to(device)
                labels = labels.to(device)
                outs = model(input_ids=inputs,
                             token_type_ids=token_types,
                             attention_mask=attention_mask)
            except (ValueError, TypeError):  # batch without token_type_ids
                inputs, attention_mask, labels = train_batch.values()
                inputs = inputs.to(device)
                attention_mask = attention_mask.to(device)
                labels = labels.to(device)
                outs = model(input_ids=inputs, attention_mask=attention_mask)

            preds = torch.argmax(outs.logits, dim=-1)
            loss = criterion(outs.logits, labels)
            acc = (preds == labels).sum().item() / len(labels)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.7)
            optimizer.step()

            if scheduler:
                scheduler.step()

            neptune.log_metric('learning_rate', get_lr(optimizer))

            train_loss.update(loss.item(), len(labels))
            train_acc.update(acc, len(labels))

            if (idx + 1) % args.log_interval == 0:
                current_lr = get_lr(optimizer)
                print(
                    f"Epoch[{epoch + 1}/{args.epochs}]({idx + 1}/{len(train_loader)}) || "
                    f"training loss {train_loss.avg:.4f} || training accuracy {train_acc.avg:4.2%} || lr {current_lr}"
                )
                logger.add_scalar("Train/loss", train_loss.avg,
                                  epoch * len(train_loader) + idx)
                logger.add_scalar("Train/accuracy", train_acc.avg,
                                  epoch * len(train_loader) + idx)

        neptune.log_metric(f'Train_loss', train_loss.avg)
        neptune.log_metric(f'Train_avg', train_acc.avg)
        neptune.log_metric('learning_rate', current_lr)

        val_loss, val_acc = AverageMeter(), AverageMeter()
        # val loop
        with torch.no_grad():
            print("Calculating validation results...")
            model.eval()

            for val_batch in val_loader:
                try:
                    inputs, token_types, attention_mask, labels = val_batch.values()
                    inputs = inputs.to(device)
                    token_types = token_types.to(device)
                    attention_mask = attention_mask.to(device)
                    labels = labels.to(device)
                    outs = model(input_ids=inputs,
                                 token_type_ids=token_types,
                                 attention_mask=attention_mask)
                except (ValueError, TypeError):  # batch without token_type_ids
                    inputs, attention_mask, labels = val_batch.values()
                    inputs = inputs.to(device)
                    attention_mask = attention_mask.to(device)
                    labels = labels.to(device)
                    outs = model(input_ids=inputs,
                                 attention_mask=attention_mask)

                preds = torch.argmax(outs.logits, dim=-1)
                loss = criterion(outs.logits, labels)
                acc = (preds == labels).sum().item() / len(labels)

                val_loss.update(loss.item(), len(labels))
                val_acc.update(acc, len(labels))

            if val_acc.avg > best_val_acc:
                print(
                    f"New best model for val acc : {val_acc.avg:4.2%}! saving the best model.."
                )
                torch.save(model.state_dict(), f"{save_dir}/best.pth")
                best_val_acc = val_acc.avg
                best_val_loss = min(best_val_loss, val_loss.avg)

            print(
                f"[Val] acc : {val_acc.avg:4.2%}, loss : {val_loss.avg:.4f} || "
                f"best acc : {best_val_acc:4.2%}, best loss : {best_val_loss:.4f}"
            )
            logger.add_scalar("Val/loss", val_loss.avg, epoch)
            logger.add_scalar("Val/accuracy", val_acc.avg, epoch)
            neptune.log_metric(f'Val_loss', val_loss.avg)
            neptune.log_metric(f'Val_avg', val_acc.avg)

            print()
Exemple #24
0
 def __init__(self, max_len: int):
     self.tokenizer = KoBertTokenizer.from_pretrained("monologg/kobert")
     self.max_len = max_len
     self.pad_token_id = 0
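A companion method that actually applies max_len and pad_token_id would presumably look like this (a sketch; the rest of the class is not shown):

 def encode(self, text: str):
     # Hypothetical: truncate to max_len, then right-pad with the configured pad_token_id.
     ids = self.tokenizer.encode(text)[:self.max_len]
     return ids + [self.pad_token_id] * (self.max_len - len(ids))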
Exemple #25
0
import pandas as pd
import numpy as np
import tensorflow as tf

from transformers import TFBertModel, TFDistilBertModel
model = TFBertModel.from_pretrained('monologg/kobert', from_pt=True)
from tokenization_kobert import KoBertTokenizer
tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')  # the same call works for 'monologg/distilkobert'

# 0. HyperParameter ------------------

maxlen = 80
batch_size_HP = 48
epochs_HP = 2
classes = 5

# 1. data preprocessing ----------------

X_data = []
Y_data = []
X_test = []
Y_test = []
train_dataset = pd.read_csv("dataset.csv", encoding='utf-8', sep='|')
test_dataset = pd.read_csv("testset.csv", encoding='utf-8', sep='|')
train_dataset = train_dataset.dropna()
test_dataset = test_dataset.dropna()
k=0

X_data_ids = []
X_data_attn = []
X_data_seg = []
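The ids/attention/segment lists above are presumably filled with encode_plus, along these lines (a sketch using a recent transformers version; the text column name is a guess):

for sentence in train_dataset['text']:  # column name assumed
    enc = tokenizer.encode_plus(sentence,
                                max_length=maxlen,
                                padding='max_length',
                                truncation=True)
    X_data_ids.append(enc['input_ids'])
    X_data_attn.append(enc['attention_mask'])
    X_data_seg.append(enc['token_type_ids'])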
Exemple #26
0
def train(cfg):
   
    results_path = os.path.join('./results', cfg['train_id_name'])
    if not os.path.exists(results_path):
        os.mkdir(results_path)

    os.environ['WANDB_PROJECT'] = 'KLUE_PROJECT'
    os.environ['WANDB_LOG_MODEL'] = 'true'

    MODEL_NAME = cfg['model_name']
    
    if MODEL_NAME == 'monologg/kobert':
        tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
    else:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    with open('../input/data/train/ner_tags.pickle', 'rb') as f:
        ner_tokens = pickle.load(f)
    special_tokens_dict = {'additional_special_tokens': ner_tokens}
    tokenizer.add_special_tokens(special_tokens_dict)
    
    train_dataset = load_data(cfg['train_data_path'])
    dev_dataset = load_data(cfg['valid_data_path'])
    
    train_label = train_dataset['label'].values
    dev_label = dev_dataset['label'].values
    
    tokenized_train = tokenized_dataset(train_dataset, tokenizer)
    tokenized_dev = tokenized_dataset(dev_dataset, tokenizer)
    
    RE_train_dataset = RE_Dataset(tokenized_train, train_label)
    RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label)
    
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    
    model_config = AutoConfig.from_pretrained(MODEL_NAME)
    model_config.num_labels = 42
#     model_config.vocab_size += len(ner_tokens)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=model_config)
    model.resize_token_embeddings(len(tokenizer))
    model.parameters
    model.to(device)
    
    training_configs = cfg['train_args']

    training_args = TrainingArguments(
                        output_dir=results_path,          
                        save_total_limit=training_configs['save_total_limit'],
                        save_steps=training_configs['save_steps'],
                        num_train_epochs=training_configs['num_train_epochs'],
                        learning_rate=training_configs['learning_rate'],
                        per_device_train_batch_size=training_configs['per_device_train_batch_size'],
                        per_device_eval_batch_size=training_configs['per_device_eval_batch_size'],
                        warmup_steps=training_configs['warmup_steps'],
                        weight_decay=training_configs['weight_decay'],
                        logging_dir=training_configs['logging_dir'],
                        logging_steps=training_configs['logging_steps'],
                        evaluation_strategy=training_configs['evaluation_strategy'],
                        load_best_model_at_end=True,
                        )

    trainer = Trainer(
                    model=model,
                    args=training_args,
                    train_dataset=RE_train_dataset,
                    eval_dataset=RE_dev_dataset,
                    compute_metrics=compute_metrics,
                    callbacks=[transformers.EarlyStoppingCallback(early_stopping_patience=cfg['early_stopping_patience'],),]
                    )
    
    transformers.integrations.WandbCallback()
    
    print('Start Training.')
    trainer.train()
    print('Finished Training.')
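train(cfg) reads a plain config dict; a minimal example covering only the keys accessed above (all values are illustrative guesses):

cfg = {
    'train_id_name': 'kobert-baseline',
    'model_name': 'monologg/kobert',
    'train_data_path': '../input/data/train/train.tsv',
    'valid_data_path': '../input/data/train/valid.tsv',
    'early_stopping_patience': 3,
    'train_args': {
        'save_total_limit': 3,
        'save_steps': 100,
        'num_train_epochs': 4,
        'learning_rate': 5e-5,
        'per_device_train_batch_size': 32,
        'per_device_eval_batch_size': 64,
        'warmup_steps': 500,
        'weight_decay': 0.01,
        'logging_dir': './logs',
        'logging_steps': 100,
        'evaluation_strategy': 'steps',
    },
}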
Exemple #27
0
    def _prepare_model(self, args, labels, num_labels, mode='train', model_dir=""):
        """ prepare model and tokenizer for the trainer.
        :param args: parsed argument.
        :param labels: label list of NER.
        :param num_labels: number of labels.
        :return: pretrained model, tokenizer.
        """
        # Load pretrained model and tokenizer
        if args.local_rank not in [-1, 0]:
            torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

        args.model_type = args.model_type.lower()
        config = AutoConfig.from_pretrained(
            args.config_name if args.config_name else model_dir,
            num_labels=num_labels,
            id2label={str(i): label for i, label in enumerate(labels)},
            label2id={label: i for i, label in enumerate(labels)},
            cache_dir=args.cache_dir if args.cache_dir else None,
        )
        tokenizer_args = {k: v for k, v in vars(args).items() if v is not None and k in TOKENIZER_ARGS}
        logger.info("Tokenizer arguments: %s", tokenizer_args)

        if mode == 'train':
            tokenizer = KoBertTokenizer.from_pretrained(
                args.tokenizer_name if args.tokenizer_name else model_dir,
                cache_dir=args.cache_dir if args.cache_dir else None,
                **tokenizer_args,
            )
        else:
            tokenizer = KoBertTokenizer.from_pretrained(model_dir, **tokenizer_args)
        # the tokenizer is pinned to KoBERT (instead of the AutoTokenizer call below)

        # tokenizer = AutoTokenizer.from_pretrained(
        #     args.tokenizer_name if args.tokenizer_name else model_dir,
        #     cache_dir=args.cache_dir if args.cache_dir else None,
        #     **tokenizer_args,
        # )

        # CRF Adding
        if mode == 'train':
            model = BertCRFForTokenClassification.from_pretrained(
                model_dir,
                from_tf=bool(".ckpt" in model_dir),
                config=config,
                cache_dir=args.cache_dir if args.cache_dir else None,
            )
        else:
            logger.info("Evaluate the following checkpoints: %s", [model_dir])
            model = BertCRFForTokenClassification.from_pretrained(model_dir)

        # model = AutoModelForTokenClassification.from_pretrained(
        #     args.model_name_or_path,
        #     from_tf=bool(".ckpt" in args.model_name_or_path),
        #     config=config,
        #     cache_dir=args.cache_dir if args.cache_dir else None,
        # )

        if args.local_rank == 0:
            torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

        model.to(args.device)

        logger.info("Training/evaluation parameters %s", args)

        return model, tokenizer
Exemple #28
0
def train():
    seed_everything()
    transformers.logging.set_verbosity_info()

    parser = argparse.ArgumentParser()
    # parser.add_argument('--model_name', default='bert-base-multilingual-cased')
    parser.add_argument('--model_name', default='xlm-roberta-large')
    parser.add_argument('--version', default='v6', type=str)
    parser.add_argument('--valid_ratio', type=float, default=0.0)
    parser.add_argument('--epochs', type=int, default=5)
    parser.add_argument('--lr', type=float, default=2e-5)
    parser.add_argument('--adam_eps', type=float, default=1e-8)
    parser.add_argument('--weight_decay', type=float, default=0.001)
    parser.add_argument('--warmup_steps', type=int, default=500)
    parser.add_argument('--batch_size', type=int, default=8)
    parser.add_argument('--accumulation_steps', type=int, default=1)
    parser.add_argument('--max_grad_norm', type=float, default=1.0)
    parser.add_argument('--l2_reg_lambda', type=float, default=5e-3)
    parser.add_argument('--hidden_dropout_prob', type=float, default=0.2)
    parser.add_argument('--max_len', type=int, default=150)
    parser.add_argument('--scheduler_type', type=str, default='cosine')
    parser.add_argument('--data_type', type=str, default='original')

    args = parser.parse_args()

    if not os.path.exists(f'../results/{args.version}'):
        os.makedirs(f'../results/{args.version}', exist_ok=True)
    logging.basicConfig(level=logging.INFO)
    logging.basicConfig(filename=f'../results/{args.version}.log',
                        filemode='w',
                        format='%(asctime)s ==> %(message)s')
    wandb.init(config=args,
               project="[Pstage-NLP]",
               name=args.version,
               save_code=True)

    # load model and tokenizer
    MODEL_NAME = args.model_name
    if MODEL_NAME == "monologg/kobert":
        from tokenization_kobert import KoBertTokenizer
        tokenizer = KoBertTokenizer.from_pretrained(MODEL_NAME)
    else:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    if args.data_type == "original":
        train_data_path = "../data/train/train.tsv"
    elif args.data_type == "extra_v1":
        train_data_path = "../data/train/train_and_extra_v1.tsv"
    elif args.data_type == "extra_v2":
        train_data_path = "../data/train/train_and_extra_v2.tsv"
    elif args.data_type == "aug":
        train_data_path = "../data/train/aug_extra_train.tsv"

    # load dataset
    total_dataset = load_data2(train_data_path, "../data/label_type.pkl")
    if args.valid_ratio > 0.0:
        train_dataset, valid_dataset = train_test_split(
            total_dataset,
            test_size=args.valid_ratio,
            random_state=42,
            shuffle=True,
            stratify=total_dataset.label)
        valid_label = valid_dataset['label'].values
        valid_features = tokenized_dataset2(valid_dataset,
                                            tokenizer,
                                            max_length=args.max_len)
        all_input_ids = torch.tensor([f.input_ids for f in valid_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in valid_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in valid_features],
                                       dtype=torch.long)
        all_e1_mask = torch.tensor([f.e1_mask for f in valid_features],
                                   dtype=torch.long)  # add e1 mask
        all_e2_mask = torch.tensor([f.e2_mask for f in valid_features],
                                   dtype=torch.long)  # add e2 mask
        all_label_ids = torch.tensor([f.label_id for f in valid_features],
                                     dtype=torch.long)
        valid_ds = TensorDataset(all_input_ids, all_input_mask,
                                 all_segment_ids, all_label_ids, all_e1_mask,
                                 all_e2_mask)
        valid_dl = torch.utils.data.DataLoader(valid_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)
        evaluation_strategy = 'steps'
    else:
        train_dataset = total_dataset

    xlm = MODEL_NAME.startswith("xlm")
    train_features = tokenized_dataset2(train_dataset,
                                        tokenizer,
                                        xlm=xlm,
                                        max_length=args.max_len)

    # make dataset for pytorch.
    all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                   dtype=torch.long)
    all_e1_mask = torch.tensor([f.e1_mask for f in train_features],
                               dtype=torch.long)  # add e1 mask
    all_e2_mask = torch.tensor([f.e2_mask for f in train_features],
                               dtype=torch.long)  # add e2 mask
    all_label_ids = torch.tensor([f.label_id for f in train_features],
                                 dtype=torch.long)
    train_ds = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                             all_label_ids, all_e1_mask, all_e2_mask)
    train_dl = torch.utils.data.DataLoader(train_ds,
                                           batch_size=args.batch_size,
                                           shuffle=True,
                                           num_workers=3)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    val_counts = train_dataset.label.value_counts().sort_index().values
    cls_weight = 1 / np.log1p(val_counts)
    cls_weight = (cls_weight / cls_weight.sum()) * 42
    cls_weight = torch.tensor(cls_weight, dtype=torch.float32).to(device)

    # setting model hyperparameter
    model_config = AutoConfig.from_pretrained(MODEL_NAME)
    model_config.num_labels = 42
    model_config.l2_reg_lambda = args.l2_reg_lambda
    model_config.latent_entity_typing = False
    if MODEL_NAME.startswith("bert"):
        model = BertForSequenceClassification(model_config, MODEL_NAME)
    elif MODEL_NAME.startswith("xlm"):
        model = XLMRobertaForSequenceClassification(model_config, MODEL_NAME)
    model.parameters
    model.to(device)

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': args.weight_decay,
        },
        {
            'params': [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0,
        },
    ]
    optimizer = optim.AdamW(optimizer_grouped_parameters,
                            lr=args.lr,
                            eps=args.adam_eps)

    num_training_steps = int(
        len(train_dl) // args.accumulation_steps * args.epochs)
    if args.scheduler_type == "cosine":
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args.warmup_steps,
            num_training_steps=num_training_steps)
    else:
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args.warmup_steps,
            num_training_steps=num_training_steps)

    logging.info("***** Running training *****")
    logging.info("  Num examples = %d", len(train_dataset))
    logging.info("  Num Epochs = %d", args.epochs)
    logging.info("  Total optimization steps = %d", num_training_steps)

    wandb.watch(model)
    global_step = 0
    tr_loss = 0.0
    model.zero_grad()

    min_loss = float("INF")
    train_iterator = trange(int(args.epochs), desc="Epoch")
    for _ in train_iterator:
        corrects = 0
        total_sample = 0
        epoch_iterator = tqdm(train_dl, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2],
                'labels': batch[3],
                'e1_mask': batch[4],
                'e2_mask': batch[5],
                'cls_weight': cls_weight
            }

            outputs = model(**inputs)
            # model outputs are always tuple in pytorch-transformers (see doc)
            loss = outputs[0]
            pred = outputs[1]
            _, pred = torch.max(pred, dim=-1)
            corrects += np.sum((pred == batch[3]).detach().cpu().numpy())
            total_sample += batch[0].size(0)
            tr_acc = corrects / total_sample * 100

            if args.accumulation_steps > 1:
                loss = loss / args.accumulation_steps

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           args.max_grad_norm)

            tr_loss += loss.item()

            if (step + 1) % args.accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
                wandb.log({"loss": tr_loss / global_step, "acc": tr_acc})

                if global_step % 100 == 0:
                    logging.info("global_step = %s, average loss = %s",
                                 global_step, tr_loss / global_step)

                    if min_loss > tr_loss / global_step:
                        logging.info(
                            f"Loss: {min_loss:.6f} -> {tr_loss/global_step:.6f}"
                        )
                        logging.info("save.")
                        min_loss = tr_loss / global_step

                        save_path = f"../results/{args.version}/checkpoint-best"
                        model.save_pretrained(save_path)
Exemple #29
0
# Assumed setup for this fragment: a Mecab instance and an example input line.
from konlpy.tag import Mecab
from tokenization_kobert import KoBertTokenizer

mecab = Mecab()
line = "이순신은 조선 중기의 무신이다."  # placeholder input; the original source line is not shown

tokened_str = mecab.morphs(line)
print(tokened_str)
# f.write (' / '.join(tokened_str) + "\n")

tokened_str = mecab.nouns(line)
print(tokened_str)
# f.write (' / '.join(tokened_str) + "\n")

tokened_str = mecab.pos(line)
print(tokened_str)
# for i in tokened_str:
#     f.write('/'.join(i) + "  ")
# f.write("\n")

tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
tokened_str = tokenizer.tokenize("[CLS]" + line + "[SEP]")
print(tokened_str)
# f.write(' / '.join(tokened_str) + "\n")

print(tokenizer.convert_tokens_to_ids(tokened_str))
str = "[" + ', '.join(str(e) for e in tokenizer.convert_tokens_to_ids(tokened_str)) + "]"
# f.write (''.join(str))
# f.close()

import torch
from kobert_transformers import get_distilkobert_model, get_kobert_model

model = get_distilkobert_model()
#input_ids = torch.LongTensor([[31, 51, 99, 12, 20, 55, 87]])
input_ids = torch.LongTensor([tokenizer.convert_tokens_to_ids(tokened_str)])
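For completeness, the forward pass over those ids would be (a sketch):

model.eval()
with torch.no_grad():
    outputs = model(input_ids=input_ids)
last_hidden_state = outputs[0]  # shape: (batch, seq_len, hidden)
print(last_hidden_state.shape)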
Exemple #30
0
def main():
    parser = argparse.ArgumentParser(
        description="Preprocess the data (tokenization + token_to_ids) to avoid re-doing it several times."
    )
    parser.add_argument('--file_path',
                        type=str,
                        default='data/dump.txt',
                        help='The path to the data.')
    parser.add_argument('--tokenizer_type',
                        type=str,
                        default='bert',
                        choices=['bert', 'roberta', 'gpt2', 'kobert'])
    parser.add_argument('--tokenizer_name',
                        type=str,
                        default='bert-base-uncased',
                        help="The tokenizer to use.")
    parser.add_argument('--dump_file',
                        type=str,
                        default='data/dump',
                        help='The dump file prefix.')
    args = parser.parse_args()

    logger.info(f'Loading Tokenizer ({args.tokenizer_name})')
    if args.tokenizer_type == 'bert':
        tokenizer = BertTokenizer.from_pretrained(args.tokenizer_name)
        bos = tokenizer.special_tokens_map['cls_token']  # `[CLS]`
        sep = tokenizer.special_tokens_map['sep_token']  # `[SEP]`
    elif args.tokenizer_type == 'roberta':
        tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
        bos = tokenizer.special_tokens_map['cls_token']  # `<s>`
        sep = tokenizer.special_tokens_map['sep_token']  # `</s>`
    elif args.tokenizer_type == 'gpt2':
        tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer_name)
        bos = tokenizer.special_tokens_map['bos_token']  # `<|endoftext|>`
        sep = tokenizer.special_tokens_map['eos_token']  # `<|endoftext|>`
    elif args.tokenizer_type == 'kobert':
        tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
        bos = tokenizer.special_tokens_map['cls_token']
        sep = tokenizer.special_tokens_map['sep_token']

    logger.info(f'Loading text from {args.file_path}')
    with open(args.file_path, 'r', encoding='utf8') as fp:
        data = fp.readlines()

    logger.info(f'Start encoding')
    logger.info(f'{len(data)} examples to process.')

    rslt = []
    n_processed = 0  # renamed from `iter` to avoid shadowing the built-in
    interval = 10000
    start = time.time()
    for text in data:
        text = f'{bos} {text.strip()} {sep}'
        token_ids = tokenizer.encode(text, add_special_tokens=False)
        rslt.append(token_ids)

        n_processed += 1
        if n_processed % interval == 0:
            end = time.time()
            logger.info(
                f'{n_processed} examples processed. - {(end-start)/interval:.2f}s/expl'
            )
            start = time.time()
    logger.info('Finished binarization')
    logger.info(f'{len(data)} examples processed.')

    dp_file = f'{args.dump_file}.{args.tokenizer_name}.pickle'
    rslt_ = [np.uint16(d) for d in rslt]
    random.shuffle(rslt_)
    logger.info(f'Dump to {dp_file}')
    with open(dp_file, 'wb') as handle:
        pickle.dump(rslt_, handle, protocol=pickle.HIGHEST_PROTOCOL)
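Assuming the script is saved as binarize.py (the filename is a guess), a KoBERT run would look like:

python binarize.py --file_path data/korean_corpus.txt --tokenizer_type kobert --tokenizer_name monologg/kobert --dump_file data/kobert_dump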