Example #1
    def __init__(self, test_file, vocab, max_token_cnt=300):
        self.tokenizer = SentencepieceTokenizer(get_tokenizer())
        self.vocab = vocab

        self.max_token_cnt = max_token_cnt

        self.media_map = {
            '경기일보': 0,
            '광양신문': 1,
            '광주매일신문': 2,
            '광주일보': 3,
            '국제신문': 4,
            '기호일보': 5,
            '남도일보': 6,
            '당진시대': 7,
            '대구신문': 8,
            '대구일보': 9,
            '대전일보': 10
        }
        print("medias", self.media_map)

        samples = []
        with jsonlines.open(test_file) as f:
            for line in f.iter():
                media = line['media']
                id = line['id']
                sentences = []
                for i, sentence in enumerate(line['article_original']):
                    sentences.append(sentence.replace('\n', '').strip())
                samples.append([sentences, media, id])
        self.samples = samples
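(Note, not part of the original example: a minimal sketch of how the SentencepieceTokenizer and KoBERT vocab used above map a raw sentence to subword pieces and then to the integer ids that max_token_cnt ultimately caps; the sample sentence is made up.)

from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
from gluonnlp.data import SentencepieceTokenizer

_, vocab = get_pytorch_kobert_model()
tokenizer = SentencepieceTokenizer(get_tokenizer())

pieces = tokenizer("한국어 기사 문장을 토큰화합니다.")  # subword pieces (strings)
token_ids = [vocab[piece] for piece in pieces]          # gluonnlp vocab maps piece -> index
print(pieces)
print(token_ids[:300])                                  # truncation analogous to max_token_cnt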
Example #2
def predict(model, text):
    device = torch.device("cuda:0")
    max_len = 64
    batch_size = 64
    warmup_ratio = 0.1
    num_epochs = 2
    max_grad_norm = 1
    log_interval = 200
    learning_rate = 5e-5
    
    tokenizer = get_tokenizer()
    bertmodel, vocab = get_pytorch_kobert_model()
    tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
    data_test = BERTDataset(text, 0, 1, tok, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=0)
    model.eval()
    
    answer=[]
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        max_vals, max_indices = torch.max(out, 1)
        answer.append(max_indices.cpu().clone().numpy())
    
    result = F.softmax(out, dim=1)  # note: softmax is applied only to the last batch's logits
    
    print(result)
    return result
Example #3
def generate_subword_script(dataset_path,
                            new_path,
                            script_prefix,
                            use_pretrain_kobert_tokenizer=False):
    print('create_subword_script...')

    if use_pretrain_kobert_tokenizer:
        tok_path = get_tokenizer()
        sp = SentencepieceTokenizer(tok_path)

    else:
        sp = spm.SentencePieceProcessor()
        vocab_file = "aihub_sentencepiece.model"
        sp.load(vocab_file)

    for folder in os.listdir(dataset_path):
        # folder : {KsponSpeech_01, ..., KsponSpeech_05}
        path = os.path.join(dataset_path, folder)
        for subfolder in os.listdir(path):
            path = os.path.join(dataset_path, folder, subfolder)
            for file in os.listdir(path):
                with open(os.path.join(path, file), "r",
                          encoding='cp949') as f:
                    sentence = f.read()

                if use_pretrain_kobert_tokenizer:
                    encode = sp(sentence)
                else:
                    encode = sp.encode_as_ids(sentence)

                with open(os.path.join(new_path, script_prefix + file[12:]),
                          "w",
                          encoding='cp949') as f:
                    f.write(" ".join(map(str, encode)))
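(Aside, not from the original source: a hedged sketch of the two tokenizer branches above, assuming kobert, gluonnlp, and sentencepiece are installed and that the aihub_sentencepiece.model file exists. Note that the pretrained-KoBERT branch returns subword strings while the SentencePiece branch returns integer ids, so the scripts written by the two branches differ in kind.)

from kobert.utils import get_tokenizer
from gluonnlp.data import SentencepieceTokenizer
import sentencepiece as spm

sentence = "음성 인식 전사 문장입니다."   # made-up sample transcript

kobert_sp = SentencepieceTokenizer(get_tokenizer())
print(kobert_sp(sentence))              # list of subword piece strings

sp = spm.SentencePieceProcessor()
sp.load("aihub_sentencepiece.model")    # custom model referenced above
print(sp.encode_as_ids(sentence))       # list of integer piece ids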
Example #4
 def __init__(self):
     self.tok_path = get_tokenizer()
     self.sp = SentencepieceTokenizer(self.tok_path)
     self.v_dimension = 300
     self.v_window = 8
     self.hangul = re.compile("[^ㄱ-ㅎㅏ-ㅣ가-힣]+")
     self.mecab = Mecab()
Example #5
 def __init__(self, name):
     self.name = name
     self.token2index = {}
     self.index2token = {}
     self.n_tokens = 0
     tok_path = get_tokenizer()
     self.sp = SentencepieceTokenizer(tok_path)
Example #6
def get_kobert_model_and_tokenizer():
    tok_path = get_tokenizer()
    basic_tokenizer = SentencepieceTokenizer(tok_path)
    bert_base, vocab = get_pytorch_kobert_model()
    kobert_tokenizer = KoBertTokenizer(basic_tokenizer, vocab)

    return bert_base, kobert_tokenizer
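(Aside, not from the original source: a minimal sketch of what the bert_base returned above accepts, following the upstream KoBERT usage pattern; the toy ids and masks are arbitrary, and depending on the pinned transformers version the forward call may return a plain tuple or a model-output object.)

import torch
from kobert.pytorch_kobert import get_pytorch_kobert_model

model, vocab = get_pytorch_kobert_model()
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])     # arbitrary token ids
attention_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])    # 0 marks padding
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 0, 0]])
outputs = model(input_ids, attention_mask, token_type_ids)   # sequence output + pooled output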
Example #7
def chat(model_params, sent='0'):
    tok_path = get_tokenizer()
    model, vocab = get_mxnet_kobert_model(ctx=ctx)
    tok = SentencepieceTokenizer(tok_path, num_best=0, alpha=0)
    kogptqa = KoGPT2Chat(model)
    kogptqa.load_parameters(model_params, ctx=ctx)
    sent_tokens = tok(sent)
    while 1:
        q = input('user > ').strip()
        if q == 'quit':
            break
        q_tok = tok(q)
        a = ''
        a_tok = []
        while 1:
            input_ids = mx.nd.array([vocab[U_TKN]] + vocab[q_tok] +
                                    vocab[EOS, SENT] + vocab[sent_tokens] +
                                    vocab[EOS, S_TKN] +
                                    vocab[a_tok]).expand_dims(axis=0)
            pred = kogptqa(input_ids.as_in_context(ctx))
            gen = vocab.to_tokens(
                mx.nd.argmax(
                    pred,
                    axis=-1).squeeze().astype('int').asnumpy().tolist())[-1]
            if gen == EOS:
                break
            a += gen.replace('▁', ' ')
            a_tok = tok(a)
        print("Simsimi > {}".format(a.strip()))
Example #8
	def __init__(self, input_path_or_input_list, output_path):
		# load file to process
		if isinstance(input_path_or_input_list, str): # if a path is given as path string
			self.file = open(input_path_or_input_list,'rt',encoding='utf8')
		else: # if a path is given as list
			self.file = input_path_or_input_list
		self.output_path = output_path
		self.is_filetype = lambda x: any([isinstance(x, io.TextIOBase),
											isinstance(x, io.BufferedIOBase),
											isinstance(x, io.RawIOBase),
											isinstance(x, io.IOBase)])

		# tokenizer
		tok_path = get_tokenizer()
		self.tokenizer = SentencepieceTokenizer(tok_path)

		# rule set
		with open(config.post_process_rule_path,'rt',encoding='utf8') as f:
			self.rules = dict(map(lambda x:tuple(x.strip('\n').split('\t')),f))


		#dict to store (x,y,y_pred) triplet
		self.idx_map = ['x','y','y_pred']
		self.inst_dict = {}

		# numbers / hipen
		self.num_2_txt = {'^\(1\)':['우선,','먼저,','처음으로,'],
						  '^\(2\)':['두 번째로,', '이어서,','다음으로,'],
						  '^\(3\)':['세 번째로,','이어서,','다음으로,'],
						  '^\(4\)':['네 번째로,','이어서,','다음으로,'],
						  '^\(5\)':['다섯 번째로,','이어서,','다음으로,'],
						  '^\(6\)':['여섯 번째로,','이어서,','다음으로,']
                          }
Example #9
    def __init__(
        self,
        data_path,
    ) -> None:

        with open(data_path, 'rb') as f:
            data = pickle.load(f)
        self.src_tokens = data["src_tokens"]
        self.tgt_tokens = data["tgt_tokens"]

        self.src_string = data["src_raw"]
        self.tgt_string = data["tgt_raw"]
        self.ext_labels = data["ext_labels"]

        self.vocab = get_kobert_vocab()
        self.tokenizer = nlp.data.BERTSPTokenizer(get_tokenizer(),
                                                  self.vocab,
                                                  lower=False)

        self.pad_idx = self.vocab["[PAD]"]
        self.cls_idx = self.vocab["[CLS]"]
        self.sep_idx = self.vocab["[SEP]"]
        self.mask_idx = self.vocab["[MASK]"]
        self.bos_idx = self.vocab["[BOS]"]
        self.eos_idx = self.vocab["[EOS]"]
Example #10
def main():
    nsmc_home_dir = 'NSMC_DIR'
    train_file = nsmc_home_dir + '/ratings_train.txt'  # 150K
    test_file = nsmc_home_dir + '/ratings_test.txt'  # 50K

    model, vocab = get_pytorch_kobert_model(
        ctx='cuda' if torch.cuda.is_available() else 'cpu')

    lr = 5e-5
    batch_size = 16
    epochs = 5
    dropout_rate = 0.1
    max_grad_norm = 1.0
    num_total_steps = math.ceil(150000 / batch_size) * epochs
    num_warmup_steps = num_total_steps // 10
    log_interval = 100
    seed = 2019
    num_workers = 2
    num_classes = 2
    pooler_out_dim = model.pooler.dense.out_features

    torch.manual_seed(seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    print('device', device)

    tok_path = get_tokenizer()
    sp = SentencepieceTokenizer(tok_path)

    train_loader = torch.utils.data.DataLoader(MovieDataset(
        get_data(train_file, vocab, sp)),
                                               shuffle=True,
                                               batch_size=batch_size,
                                               num_workers=num_workers,
                                               collate_fn=batchify,
                                               pin_memory=True)

    test_loader = torch.utils.data.DataLoader(MovieDataset(
        get_data(test_file, vocab, sp)),
                                              batch_size=batch_size,
                                              shuffle=False,
                                              num_workers=num_workers,
                                              collate_fn=batchify,
                                              pin_memory=True)

    linear = torch.nn.Linear(pooler_out_dim, num_classes).to(device)

    all_params = list(model.parameters()) + list(linear.parameters())
    optimizer = AdamW(all_params, lr=lr, correct_bias=False)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_total_steps)
    for epoch in range(epochs):
        train(train_loader, device, model, linear, all_params, optimizer,
              scheduler, dropout_rate, max_grad_norm, log_interval, epoch)
        print(datetime.now(), 'Testing...')
        test(test_loader, device, model, linear)
Example #11
def get_sentimentLabel(input_text, time_info):
    try:
        print("2. predict sentiment label")
        device = torch.device("cpu")
        bertmodel, vocab = get_pytorch_kobert_model()
        tokenizer = get_tokenizer()
        tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

        model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)
        weights = torch.load('weight/bert_weight.pth',
                             map_location=torch.device('cpu'))
        model.load_state_dict(weights)
        model = model.to(device)
        model.eval()

        essay = pd.DataFrame(input_text)
        essay['label'] = 1
        save_link = "Data/{}.txt".format(time_info)
        essay.to_csv(save_link, sep='\t', index_label='idx')
        dataset_sentences = nlp.data.TSVDataset(save_link,
                                                field_indices=[1, 2],
                                                num_discard_samples=1)
        data_sentences = BERTDataset(dataset_sentences, 0, 1, tok, 100, True,
                                     False)  # max_len (100)
        sentences_dataloader = torch.utils.data.DataLoader(
            data_sentences, batch_size=len(data_sentences), num_workers=5)

        with torch.no_grad():
            for batch_id, (token_ids, valid_length, segment_ids,
                           label) in enumerate(sentences_dataloader):
                token_ids = token_ids.long().to(device)
                segment_ids = segment_ids.long().to(device)
                label = label.long().to(device)
                valid_length = valid_length
                outputs = model(token_ids, valid_length, segment_ids)
        pred_test = outputs
        arr = np.array(pred_test.tolist())
        arr = ne.evaluate("exp(arr)")

        label_dic = dict([(0, 'anger'), (1, 'fear'), (2, 'happiness'),
                          (3, 'miss'), (4, 'sadness'), (5, 'surprised'),
                          (6, 'worry')])
        for i in range(7):
            essay[label_dic[i]] = [proba[i] for proba in arr]
        essay['label'] = list(map(np.argmax, arr))
        indices = np.array(list(map(
            np.max, arr))).argsort()[::-1][0:min(len(essay), 10)]
        prob = essay.iloc[indices].sum(axis=0)[2:].astype(float)
        prob['happiness'] *= 0.6
        prob['fear'] *= 0.8
        prob['worry'] *= 2
        result = prob.idxmax()
        if result == 'fear':
            result = 'sadness'
        return result
    except Exception:
        raise Sentiment_Error()
Example #12
    def load_model(self):
        self.bert_model, self.vocab = get_pytorch_kobert_model(ctx=self.device)
        self.model = BERTClassifier(self.bert_model, dr_rate=self.dropout_rt).to(self.device)

        self.model.load_state_dict(torch.load(self.save_path, map_location=self.device))

        self.tokenizer = get_tokenizer()
        self.token = gluonnlp.data.BERTSPTokenizer(self.tokenizer, self.vocab, lower=False)

        self.line_converter = Converter(self.token, self.max_len, self.pad, self.pair, self.device)
Example #13
def main(args):
    for arg in vars(args):
        print(arg, getattr(args, arg))
    tokenizer = SentencepieceTokenizer(get_tokenizer())

    lines_len = 0
    src_docs = []
    with jsonlines.open(args.train_file) as f:

        for line in f.iter():
            lines_len += 1
            sentences = []
            for sentence in line['article_original']:
                sentences.append(sentence)
            src_docs.append(" ".join(sentences).replace('\n', '') + "\n")

    lens = []
    tr_max_src = 0
    for i, src_doc in enumerate(src_docs):
        if i % 100 == 0:
            print(i, len(src_docs))
        tokens = tokenizer(src_doc)
        cur_len = len(tokens)
        lens.append(cur_len)
        if tr_max_src < cur_len:
            tr_max_src = cur_len

    src_docs = []
    with jsonlines.open(args.test_file) as f:

        for line in f.iter():
            lines_len += 1
            sentences = []
            for sentence in line['article_original']:
                sentences.append(sentence)
            src_docs.append(" ".join(sentences).replace('\n', '') + "\n")

    max_src = 0
    test_lens = []
    for i, src_doc in enumerate(src_docs):
        if i % 100 == 0:
            print(i, len(src_docs))
        tokens = tokenizer(src_doc)
        cur_len = len(tokens)
        test_lens.append(cur_len)
        if max_src < cur_len:
            max_src = cur_len
    print("max source length train", tr_max_src)
    print("max source length test", max_src)
    print(sum(lens) / len(lens))
    print(sum(test_lens) / len(test_lens))
    import numpy as np
    print(np.median(np.array(lens)))
    print(np.median(np.array(test_lens)))
    print("done")
Example #14
    def __init__(self, filename, vocab, maxlen, use_emotion):

        #Store the contents of the file in a pandas dataframe
        self.df = pd.read_csv(filename, header=0, encoding='utf-8')
        # self.df = pd.read_csv(filename, delimiter = '\t')
        self.sp = SentencepieceTokenizer(get_tokenizer())
        self.vocab = vocab
        self.maxlen = maxlen
        self.use_emotion = use_emotion

        self.sp.tokens.index('!')  # raises ValueError if '!' is missing from the sentencepiece vocab
Example #15
def test_loader(dtls, max_len, batch_size, num_workers):
    tokenizer = get_tokenizer()
    _, vocab = get_pytorch_kobert_model()
    tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

    data_test = BERTDataset(dtls, 0, 1, tok, max_len, True, False)

    test_dataloader = torch.utils.data.DataLoader(data_test,
                                                  batch_size=batch_size,
                                                  num_workers=num_workers)

    return test_dataloader
Example #16
    def __init__(self):

        self.device = torch.device("cuda:0")
        self.bertmodel, self.vocab = get_pytorch_kobert_model()

        bertmodel, vocab = get_pytorch_kobert_model()

        # tokenization
        tokenizer = get_tokenizer()
        self.tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

        self.max_len = 64
        self.batch_size = 64
Example #17
    def load_model(self):
        self.bert_model, self.vocab = get_pytorch_kobert_model(ctx=self.device)
        self.model = BERTClassifier(self.bert_model,
                                    dr_rate=self.dropout_rt).to(self.device)
        if self.get_weights:
            print("get model from pretrained weights")
            self.model.load_state_dict(
                torch.load(self.model_save_path, map_location=self.device))

        self.tokenizer = get_tokenizer()
        self.token = gluonnlp.data.BERTSPTokenizer(self.tokenizer,
                                                   self.vocab,
                                                   lower=False)
Example #18
    def __init__(self, train_path, test_path, kaggle_path, use_all):
        device = torch.device("cuda:0")

        bertmodel, vocab = get_pytorch_kobert_model()
        self.model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)

        dataset_train = nlp.data.TSVDataset(train_path,
                                            field_indices=[1, 2],
                                            num_discard_samples=1)
        dataset_test = nlp.data.TSVDataset(test_path,
                                           field_indices=[1, 2],
                                           num_discard_samples=1)
        dataset_kaggle = nlp.data.TSVDataset(kaggle_path,
                                             field_indices=[1],
                                             num_discard_samples=1,
                                             encoding='cp949')

        tokenizer = get_tokenizer()
        tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

        data_train = BERTDataset(dataset_train, 0, 1, tok, config.max_len,
                                 True, False)
        data_test = BERTDataset(dataset_test, 0, 1, tok, config.max_len, True,
                                False)
        data_kaggle = BERTDataset(dataset_kaggle,
                                  0,
                                  1,
                                  tok,
                                  config.max_len,
                                  True,
                                  False,
                                  kaggle=True)

        self.train_dataloader = torch.utils.data.DataLoader(
            data_train, batch_size=config.batch_size, num_workers=5)
        self.test_dataloader = torch.utils.data.DataLoader(
            data_test, batch_size=config.batch_size, num_workers=5)
        self.kaggle_dataloader = torch.utils.data.DataLoader(data_kaggle,
                                                             batch_size=1,
                                                             num_workers=5)
        if use_all:
            dataset_all = nlp.data.TSVDataset(config.all_path,
                                              field_indices=[1, 2],
                                              num_discard_samples=1)
            data_all = BERTDataset(dataset_all, 0, 1, tok, config.max_len,
                                   True, False)
            self.all_dataloader = torch.utils.data.DataLoader(
                data_all,
                batch_size=config.batch_size,
                num_workers=5,
                shuffle=True)
Example #19
 def __init__(self, tokenizer_s='spacy'):
     """
     bert-multi, kbalbert : [PAD], [CLS], ...
     :param tokenizer: string to represent tokenizer like 'spacy', 'bert', ...
     Example::
     
     nlp = English()
     tokenizer = nlp.Defaults.create_tokenizer(nlp)      
     tokenizer = Tokenizer(tokenizer)
     """
     if type(tokenizer_s) is str:
         self.tokenizer_s = tokenizer_s
     if tokenizer_s == 'spacy':
         self.nlp = spacy.load(
             "en_core_web_md")  # md, large have embed vectors
         self.tokenizer = self.nlp.Defaults.create_tokenizer(self.nlp)
     elif tokenizer_s == 'bert-multi':
         from transformers import BertTokenizer, BertModel, BertConfig
         self.tokenizer = BertTokenizer.from_pretrained(
             'bert-base-multilingual-cased')
         self.vocab = self.tokenizer.vocab
     elif tokenizer_s == 'sktkobert':
         import gluonnlp as nlp
         from kobert.utils import get_tokenizer
         from kobert.pytorch_kobert import get_pytorch_kobert_model
         kobert, vocab = get_pytorch_kobert_model()
         self.tokenizer = nlp.data.BERTSPTokenizer(get_tokenizer(),
                                                   vocab,
                                                   lower=False)
         self.vocab = vocab
     elif tokenizer_s == 'kbalbert':
         import sys
         sys.path.append(
             '/home/bwlee/work/codes/KB-ALBERT-KO/kb-albert-char/')
         from transformers import AlbertModel, TFAlbertModel
         from tokenization_kbalbert import KbAlbertCharTokenizer
         model_path = '/home/bwlee/work/codes/KB-ALBERT-KO/kb-albert-char/model'
         self.tokenizer = KbAlbertCharTokenizer.from_pretrained(model_path)
         self.vocab = self.tokenizer.vocab
     else:
         if type(tokenizer_s) is str:
             from transformers import BertTokenizer, BertModel, BertConfig
             self.tokenizer = BertTokenizer.from_pretrained(tokenizer_s)
             self.vocab = self.tokenizer.vocab
         elif type(tokenizer_s) is not str:
             self.tokenizer = tokenizer_s
             self.tokenizer_s = 'custom'
         else:
             raise Exception('check tokenizer is correctly defined')
     self.pre_trained = self.tokenizer_s
Example #20
def bert_test(opt):

    device = torch.device('cuda:{}'.format(opt.device))
    model = torch.load(opt.weights)
    model.to(device)
    # model = nn.DataParallel(model, output_device=[0,1])
    bertmodel, vocab = get_pytorch_kobert_model()
    model.eval()  # switch to evaluation mode

    def calc_accuracy(X, Y):
        max_vals, max_indices = torch.max(X, 1)
        train_acc = (max_indices
                     == Y).sum().data.cpu().numpy() / max_indices.size()[0]
        return train_acc

    tokenizer = get_tokenizer()
    tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
    max_len = 256  # BERT does not learn from tokens beyond this length
    batch_size = opt.batch
    warmup_ratio = 0.1
    num_epochs = 2
    max_grad_norm = 1
    log_interval = 200
    learning_rate = 5e-5
    dataset_test = nlp.data.TSVDataset(opt.source,
                                       field_indices=[1, 2],
                                       num_discard_samples=1)
    data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(data_test,
                                                  batch_size=batch_size,
                                                  num_workers=5)
    test_acc = 0.0
    df = pd.DataFrame(columns=['pred', 'label'])
    pred = np.array([])
    # answer = np.array([])
    for batch_id, (token_ids, valid_length, segment_ids,
                   label) in enumerate(tqdm_notebook(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length = valid_length
        # label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        _, max_idx = torch.max(out, 1)
        pred = np.append(pred, max_idx.cpu().detach().tolist())
        # answer = np.append(answer,label.cpu().detach().tolist())
        # test_acc += calc_accuracy(out, label)
        # print(len(pred))
    df['pred'] = pred
    # df['label'] = answer
    df.to_csv(opt.save_csv_name, index=False)
Example #21
    def __init__(self, vocab=None, tokenizer=None, maxlen=30, model_dir=Path('data_in')):

        if vocab is None or tokenizer is None:
            tok_path = get_tokenizer()
            self.ptr_tokenizer = SentencepieceTokenizer(tok_path)
            self.ptr_detokenizer = SentencepieceDetokenizer(tok_path)
            _, vocab_of_gluonnlp = get_pytorch_kobert_model()
            token2idx = vocab_of_gluonnlp.token_to_idx
            self.vocab = Vocabulary(token2idx=token2idx)
            self.tokenizer = Tokenizer(vocab=self.vocab, split_fn=self.ptr_tokenizer, pad_fn=keras_pad_fn, maxlen=maxlen)
        else:
            self.vocab = vocab
            self.tokenizer = tokenizer
        self.maxlen = maxlen
        self.model_dir = model_dir
Example #22
    def __init__(self,
                 samples,
                 vocab,
                 media_map,
                 word_dropout_prob=0.0,
                 max_word_dropout_ratio=0.0,
                 max_token_cnt=300):
        self.tokenizer = SentencepieceTokenizer(get_tokenizer())
        self.vocab = vocab

        self.samples = samples
        self.targets = [s[1] for s in samples]
        self.media_map = media_map
        self.word_dropout_prob = word_dropout_prob
        self.max_word_dropout_ratio = max_word_dropout_ratio
        self.max_token_cnt = max_token_cnt
Example #23
def BERT_inference(text):

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    _, vocab = get_pytorch_kobert_model(device)

    tokenizer = get_tokenizer()
    tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

    max_len = 80
    batch_size = 64
    warmup_ratio = 0.1
    num_epochs = 10
    max_grad_norm = 1
    log_interval = 30
    learning_rate = 5e-5

    # Option 1: load only the trained parameters (state_dict)
    #new_save_path = 'v3_model_only_parameter_0302.pt'
    #model = BERTClassifier(bertmodel, dr_rate=0.1)
    #model.load_state_dict(new_save_path)
    #model.eval()

    # Option 2: load the entire saved model
    save_path = 'v2_model_0302.pt'
    model = torch.load(save_path)
    model.eval()

    infer_data = BERTDataset_infer(text, 0, tok, max_len, True, False)
    infer_data = torch.tensor(next(iter(infer_data))[0]).reshape(1, -1)

    segments_tensors = torch.zeros(len(infer_data[0]))
    segments_tensors = segments_tensors.reshape(1, -1)

    valid_length = torch.tensor(len(infer_data[0]))
    valid_length = valid_length.reshape(1, -1)

    infer_data = infer_data.long().to(device)
    segments_tensors = segments_tensors.long().to(device)
    valid_length = valid_length.long().to(device)

    with torch.no_grad():
        outputs = model(infer_data, valid_length, segments_tensors)

    print("Final deep learning inference: ", torch.argmax(outputs[0]))

    return torch.argmax(outputs[0])
Example #24
def data_loader(dtls, max_len, batch_size, num_workers):
    dataset_train, dataset_test = train_test_split(dtls,
                                                   test_size=0.2,
                                                   random_state=123)

    tokenizer = get_tokenizer()
    _, vocab = get_pytorch_kobert_model()
    tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

    data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
    data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)

    train_dataloader = torch.utils.data.DataLoader(data_train,
                                                   batch_size=batch_size,
                                                   num_workers=num_workers)
    test_dataloader = torch.utils.data.DataLoader(data_test,
                                                  batch_size=batch_size,
                                                  num_workers=num_workers)

    return train_dataloader, test_dataloader
Example #25
def calc_vars(df):
    df['jamo_levenshtein'] = df.apply(
        lambda row: jamo_levenshtein(row['original'], row['corrected']),
        axis=1)

    tok_path = get_tokenizer()
    sp = SentencepieceTokenizer(tok_path)
    df['0_tokens'] = df['original'].apply(lambda x: len(sp(x)))
    df['1_tokens'] = df['corrected'].apply(lambda x: len(sp(x)))
    df['1_token/0_token'] = df['1_tokens'] / df['0_tokens']
    df['min_tokens'] = df[['0_tokens', '1_tokens']].min(axis=1)

    df = df[df['min_tokens'] > 0]
    df['log_tokens'] = df['min_tokens'].apply(lambda x: math.log(x, 20))
    df['ratio'] = df['jamo_levenshtein'] / df['min_tokens'] * df['log_tokens']

    df['0_len'] = df['original'].apply(lambda x: len(x))
    df['1_len'] = df['corrected'].apply(lambda x: len(x))
    df['len_ratio'] = df['1_len'] / df['0_len']

    return df
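(Aside, not from the original source: a hedged usage sketch for calc_vars with a toy two-row DataFrame; it assumes pandas, math, the KoBERT tokenizer imports used above, and a jamo_levenshtein implementation (for example the one in soynlp) are already imported in the module.)

import pandas as pd

toy = pd.DataFrame({
    'original':  ['오늘 날시가 좋다', '아버지가 방에 들어가신다'],
    'corrected': ['오늘 날씨가 좋다', '아버지가 방에 들어가신다'],
})
stats = calc_vars(toy)
print(stats[['0_tokens', '1_tokens', 'jamo_levenshtein', 'ratio', 'len_ratio']])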
Example #26
def convert_input_data(sentences):
    test_data = [sentences]
    print(test_data)
    tokenizer = get_tokenizer()
    tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

    max_len = 128

    test_data = BERTDataset(test_data, 0, tok, max_len, True, False)
    dataloader = torch.utils.data.DataLoader(test_data,
                                             batch_size=1,
                                             num_workers=1)

    for token_ids, valid_length, segment_ids in dataloader:
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length = valid_length
        result = model(token_ids, valid_length, segment_ids)

# return "ge"
    return result
Example #27
    def __init__(self, path: str, max_seqlen: int = 512, ignore_index=-100) -> None:
        super(TokenizedDataset, self).__init__()
        with open(path, "rb") as f:
            self.data = pickle.load(f)
        self.max_len = max_seqlen

        _, self.vocab = get_pytorch_kobert_model()
        tok = get_tokenizer()
        self.tokenizer = nlp.data.BERTSPTokenizer(tok, self.vocab, lower=False)

        if "train" in path:
            self.data["token"] = self.data["token"][:100000]
            self.data["tgt"] = self.data["tgt"][:100000]

        self.tokens = self.data["token"]
        self.labels = self.data["tgt"]

        self.cls_idx = self.vocab["[CLS]"]
        self.pad_idx = self.vocab["[PAD]"]
        self.sep_idx = self.vocab["[SEP]"]
        self.mask_idx = self.vocab["[MASK]"]
        self.ignore_idx = ignore_index
Example #28
def get_dataloaders(dataset_train,
                    dataset_test,
                    vocab,
                    batch_size,
                    max_len=64,
                    class_labels=['0', '1']):

    tokenizer = get_tokenizer()
    bert_tokenizer = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

    # for single sentence classification, set pair=False
    # for regression task, set class_labels=None
    # for inference without label available, set has_label=False
    transform = BERTDatasetTransform(bert_tokenizer,
                                     max_len,
                                     class_labels=class_labels,
                                     has_label=True,
                                     pad=True,
                                     pair=False)
    data_train = dataset_train.transform(transform)
    data_test = dataset_test.transform(transform)

    train_sampler = nlp.data.FixedBucketSampler(
        lengths=[int(item[2]) for item in data_train],
        batch_size=batch_size,
        shuffle=True)
    train_dataloader = gluon.data.DataLoader(data_train,
                                             batch_sampler=train_sampler)

    test_sampler = nlp.data.FixedBucketSampler(
        lengths=[int(item[2]) for item in data_test],
        batch_size=batch_size,
        shuffle=True)
    test_dataloader = mx.gluon.data.DataLoader(data_test,
                                               batch_sampler=test_sampler)

    return train_dataloader, test_dataloader, bert_tokenizer
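(Aside, not from the original source: a hedged call sketch for get_dataloaders using NSMC-style TSV files as in the other examples on this page; the file names are placeholders, nlp refers to gluonnlp, and BERTDatasetTransform is assumed to come from the surrounding module.)

dataset_train = nlp.data.TSVDataset('ratings_train.txt', field_indices=[1, 2], num_discard_samples=1)
dataset_test = nlp.data.TSVDataset('ratings_test.txt', field_indices=[1, 2], num_discard_samples=1)
_, vocab = get_pytorch_kobert_model()
train_dl, test_dl, bert_tok = get_dataloaders(dataset_train, dataset_test, vocab, batch_size=32)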
Example #29
def main(args):
    root = args.path
    mode = args.mode
    dset = load_data(root, mode)

    _, vocab = get_pytorch_kobert_model()
    tokenizer = get_tokenizer()
    tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

    tokenized_dset = []
    start_time = time.time()
    for d in tqdm(dset):
        tokenized_dset.append(tokenize(d, tok))
    print("--- %s seconds for tokenizing ---" % (time.time() - start_time))

    start_time = time.time()
    result = {"token": [], "tgt": []}
    for idx, data in tqdm(enumerate(tokenized_dset)):
        src = " ".join([" ".join(d) for d in data["tokenized_src"]]).split(" ")
        tgt = " ".join([" ".join(d) for d in data["tokenized_abs"]]).split(" ")

        auxiliary_tgt = make_aux_tgt(src, tgt)

        assert len(
            src) == len(auxiliary_tgt
                        ), f"Length mismatch: {len(src)}, {len(auxiliary_tgt)}"

        result["token"].append(src)
        result["tgt"].append(auxiliary_tgt)

    print("--- %s seconds for generating labels ---" %
          (time.time() - start_time))

    with open(f"{args.save_path}/contentselection_{mode}.pickle", "wb") as f:
        pickle.dump(result, f)
    print("--- Finished ---")
Example #30
def main(parser):
    # Config
    args = parser.parse_args()
    data_dir = Path(args.data_dir)
    model_dir = Path(args.model_dir)

    # data_config = Config(json_path=data_dir / 'config.json')
    model_config = Config(json_path=model_dir / 'config.json')

    # Vocab & Tokenizer
    tok_path = get_tokenizer() # ./tokenizer_78b3253a26.model
    ptr_tokenizer = SentencepieceTokenizer(tok_path)

    _, vocab_of_gluonnlp = get_pytorch_kobert_model()
    token_to_idx = vocab_of_gluonnlp.token_to_idx

    model_config.vocab_size = len(token_to_idx)
    vocab = Vocabulary(token_to_idx=token_to_idx)

    print("len(token_to_idx): ", len(token_to_idx))
    with open(model_dir / "token2idx_vocab.json", 'w', encoding='utf-8') as f:
        json.dump(token_to_idx, f, ensure_ascii=False, indent=4)

    # save vocab & tokenizer
    with open(model_dir / "vocab.pkl", 'wb') as f:
        pickle.dump(vocab, f)

    # load vocab & tokenizer
    with open(model_dir / "vocab.pkl", 'rb') as f:
        vocab = pickle.load(f)

    tokenizer = Tokenizer(vocab=vocab, split_fn=ptr_tokenizer, pad_fn=keras_pad_fn, maxlen=model_config.maxlen)
    ner_formatter = NamedEntityRecognitionFormatter(vocab=vocab, tokenizer=tokenizer, maxlen=model_config.maxlen, model_dir=model_dir)

    # Train & Val Datasets
    cwd = Path.cwd()
    data_in = cwd / "data_in"
    train_data_dir = data_in / "NER-master" / "말뭉치 - 형태소_개체명"
    tr_clf_ds = NamedEntityRecognitionDataset(train_data_dir=train_data_dir, model_dir=model_dir)
    tr_clf_ds.set_transform_fn(transform_source_fn=ner_formatter.transform_source_fn, transform_target_fn=ner_formatter.transform_target_fn)
    tr_clf_dl = DataLoader(tr_clf_ds, batch_size=model_config.batch_size, shuffle=True, num_workers=4, drop_last=False)

    # Model
    model = KobertCRF(config=model_config, num_classes=len(tr_clf_ds.ner_to_index))
    model.train()

    # optim
    train_examples_len = len(tr_clf_ds)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]

    # num_train_optimization_steps = int(train_examples_len / model_config.batch_size / model_config.gradient_accumulation_steps) * model_config.epochs
    t_total = len(tr_clf_dl) // model_config.gradient_accumulation_steps * model_config.epochs
    optimizer = AdamW(optimizer_grouped_parameters, lr=model_config.learning_rate, eps=model_config.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=model_config.warmup_steps, t_total=t_total)

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    n_gpu = torch.cuda.device_count()
    # if n_gpu > 1:
    #     model = torch.nn.DataParallel(model)
    model.to(device)

    # save
    tb_writer = SummaryWriter('{}/runs'.format(model_dir))
    checkpoint_manager = CheckpointManager(model_dir)
    summary_manager = SummaryManager(model_dir)
    best_val_loss = 1e+10
    best_train_acc = 0

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(tr_clf_ds))
    logger.info("  Num Epochs = %d", model_config.epochs)
    logger.info("  Instantaneous batch size per GPU = %d", model_config.batch_size)
    # logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
    #                args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", model_config.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    best_dev_acc, best_dev_loss = 0.0, 99999999999.0
    best_steps = 0
    model.zero_grad()
    set_seed()  # Added here for reproducibility (even between python 2 and 3)

    # Train
    train_iterator = trange(int(model_config.epochs), desc="Epoch")
    for _epoch, _ in enumerate(train_iterator):
        epoch_iterator = tqdm(tr_clf_dl, desc="Iteration") # , disable=args.local_rank not in [-1, 0]
        epoch = _epoch
        for step, batch in enumerate(epoch_iterator):
            model.train()
            x_input, token_type_ids, y_real = map(lambda elm: elm.to(device), batch)
            log_likelihood, sequence_of_tags = model(x_input, token_type_ids, y_real)

            # loss: negative log-likelihood
            loss = -1 * log_likelihood

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if model_config.gradient_accumulation_steps > 1:
                loss = loss / model_config.gradient_accumulation_steps

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), model_config.max_grad_norm)
            tr_loss += loss.item()

            if (step + 1) % model_config.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                with torch.no_grad():
                    sequence_of_tags = torch.tensor(sequence_of_tags)
                    print("sequence_of_tags: ", sequence_of_tags)
                    print("y_real: ", y_real)
                    print("loss: ", loss)
                    print("(sequence_of_tags == y_real): ", (sequence_of_tags == y_real))

                    mb_acc = (sequence_of_tags == y_real).float()[y_real != vocab.PAD_ID].mean()

                tr_acc = mb_acc.item()
                tr_loss_avg = tr_loss / global_step
                tr_summary = {'loss': tr_loss_avg, 'acc': tr_acc}

                # if step % 50 == 0:
                print('epoch : {}, global_step : {}, tr_loss: {:.3f}, tr_acc: {:.2%}'.format(epoch + 1, global_step,
                                                                                             tr_summary['loss'],
                                                                                             tr_summary['acc']))

                if model_config.logging_steps > 0 and global_step % model_config.logging_steps == 0:
                    # Log metrics
                    if model_config.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        pass
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) / model_config.logging_steps, global_step)
                    logger.info("Average loss: %s at global step: %s",
                                str((tr_loss - logging_loss) / model_config.logging_steps), str(global_step))
                    logging_loss = tr_loss

                if model_config.save_steps > 0 and global_step % model_config.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(model_config.output_dir, 'epoch-{}'.format(epoch + 1))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    logger.info("Saving model checkpoint to %s", output_dir)

                    state = {'global_step': global_step + 1,
                             'model_state_dict': model.state_dict(),
                             'opt_state_dict': optimizer.state_dict()}
                    summary = {'train': tr_summary}
                    summary_manager.update(summary)
                    summary_manager.save('summary.json')

                    is_best = tr_acc >= best_train_acc  # based on train accuracy (ideally val_acc should be used instead)
                    # Save
                    if is_best:
                        best_train_acc = tr_acc
                        checkpoint_manager.save_checkpoint(state,
                                                           'best-epoch-{}-step-{}-acc-{:.3f}.bin'.format(epoch + 1,
                                                                                                         global_step,
                                                                                                         tr_acc))
                    else:
                        torch.save(state, os.path.join(output_dir,
                                                       'model-epoch-{}-step-{}-acc-{:.3f}.bin'.format(epoch + 1,
                                                                                                      global_step,
                                                                                                      tr_acc)))

    tb_writer.close()
    logger.info(" global_step = %s, average loss = %s", global_step, tr_loss / global_step)

    return global_step, tr_loss / global_step, best_steps
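(Aside, not from the original source: a self-contained toy sketch of the masked accuracy computed inside the training loop above, where PAD positions are excluded before averaging; the pad id of 1 is assumed only for this sketch.)

import torch

PAD_ID = 1                                        # assumed pad index for this sketch
y_real = torch.tensor([[2, 3, 1, 1]])             # two real tags, two PAD positions
sequence_of_tags = torch.tensor([[2, 4, 1, 1]])   # prediction matches one of the two real tags
mask = y_real != PAD_ID
acc = (sequence_of_tags == y_real).float()[mask].mean()
print(acc)                                        # tensor(0.5000)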