Example #1
    def __init__(self):

        self.device = torch.device("cuda:0")
        self.bertmodel, self.vocab = get_pytorch_kobert_model()

        # tokenization (reuse the vocab loaded above instead of loading KoBERT twice)
        tokenizer = get_tokenizer()
        self.tok = nlp.data.BERTSPTokenizer(tokenizer, self.vocab, lower=False)

        self.max_len = 64
        self.batch_size = 64
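The snippet below is a standalone sketch (not part of the original class) showing what the tokenizer configured above produces; the sample sentence and the printout are purely illustrative.

import gluonnlp as nlp
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

bertmodel, vocab = get_pytorch_kobert_model()
tok = nlp.data.BERTSPTokenizer(get_tokenizer(), vocab, lower=False)

tokens = tok("한국어 문장을 토큰화합니다.")   # SentencePiece subword tokens
ids = vocab.to_indices(tokens)                # integer ids understood by KoBERT
print(tokens)
print(ids)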
Example #2
def predict(model, text):
    device = torch.device("cuda:0")
    max_len = 64
    batch_size = 64
    warmup_ratio = 0.1
    num_epochs = 2
    max_grad_norm = 1
    log_interval = 200 
    learning_rate =  5e-5
    
    tokenizer = get_tokenizer()
    bertmodel, vocab = get_pytorch_kobert_model()
    tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
    data_test = BERTDataset(text, 0, 1, tok, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=0)
    model.eval()
    
    answer=[]
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        max_vals, max_indices = torch.max(out, 1)
        answer.append(max_indices.cpu().clone().numpy())
    
    result = F.softmax(out, dim=1)  # note: softmax of the last batch's logits only
    
    print(result)
    return result
def get_kobert_model_and_tokenizer():
    tok_path = get_tokenizer()
    basic_tokenizer = SentencepieceTokenizer(tok_path)
    bert_base, vocab = get_pytorch_kobert_model()
    kobert_tokenizer = KoBertTokenizer(basic_tokenizer, vocab)

    return bert_base, kobert_tokenizer
def main():
    nsmc_home_dir = 'NSMC_DIR'
    train_file = nsmc_home_dir + '/ratings_train.txt'  # 150K
    test_file = nsmc_home_dir + '/ratings_test.txt'  # 50K

    model, vocab = get_pytorch_kobert_model(
        ctx='cuda' if torch.cuda.is_available() else 'cpu')

    lr = 5e-5
    batch_size = 16
    epochs = 5
    dropout_rate = 0.1
    max_grad_norm = 1.0
    num_total_steps = math.ceil(150000 / batch_size) * epochs
    num_warmup_steps = num_total_steps // 10
    log_interval = 100
    seed = 2019
    num_workers = 2
    num_classes = 2
    pooler_out_dim = model.pooler.dense.out_features

    torch.manual_seed(seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    print('device', device)

    tok_path = get_tokenizer()
    sp = SentencepieceTokenizer(tok_path)

    train_loader = torch.utils.data.DataLoader(MovieDataset(
        get_data(train_file, vocab, sp)),
                                               shuffle=True,
                                               batch_size=batch_size,
                                               num_workers=num_workers,
                                               collate_fn=batchify,
                                               pin_memory=True)

    test_loader = torch.utils.data.DataLoader(MovieDataset(
        get_data(test_file, vocab, sp)),
                                              batch_size=batch_size,
                                              shuffle=False,
                                              num_workers=num_workers,
                                              collate_fn=batchify,
                                              pin_memory=True)

    linear = torch.nn.Linear(pooler_out_dim, num_classes).to(device)

    all_params = list(model.parameters()) + list(linear.parameters())
    optimizer = AdamW(all_params, lr=lr, correct_bias=False)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_total_steps)
    for epoch in range(epochs):
        train(train_loader, device, model, linear, all_params, optimizer,
              scheduler, dropout_rate, max_grad_norm, log_interval, epoch)
        print(datetime.now(), 'Testing...')
        test(test_loader, device, model, linear)
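main() above delegates the per-epoch work to train() and test(), which are presumably defined elsewhere in the source repository. As a hedged sketch (not the repository's actual code), a single training step compatible with the objects built above (KoBERT encoder plus linear head, AdamW, linear warmup schedule, gradient clipping) might look like the following; the function name and the batch layout are assumptions.

import torch
import torch.nn.functional as F

def train_step(model, linear, batch, device, optimizer, scheduler,
               dropout_rate, max_grad_norm):
    # assumed batch layout: (token_ids, segment_ids, attention_mask, labels)
    token_ids, segment_ids, attention_mask, labels = [t.to(device) for t in batch]
    outputs = model(input_ids=token_ids,
                    token_type_ids=segment_ids,
                    attention_mask=attention_mask)
    pooled = outputs[1]                                   # pooled [CLS] representation
    logits = linear(F.dropout(pooled, p=dropout_rate, training=True))
    loss = F.cross_entropy(logits, labels)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(
        list(model.parameters()) + list(linear.parameters()), max_grad_norm)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
    return loss.item()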
Example #5
def load_model(file):
    device = torch.device("cuda:0")
    bertmodel, vocab = get_pytorch_kobert_model()
    model = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)
    model.load_state_dict(torch.load(file))
    model.eval()
    
    return model
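A brief usage note: load_model() pairs naturally with the predict() function shown earlier; the checkpoint path and the sample below are hypothetical, and the sample must be in whatever [sentence, label] layout the project's BERTDataset expects.

model = load_model("kobert_classifier_state.pt")   # hypothetical checkpoint path
samples = [["영화가 정말 재미있었어요", "0"]]        # hypothetical [sentence, label] pair
probabilities = predict(model, samples)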
Example #6
def submit(args):
    bert_model, vocab = get_pytorch_kobert_model()
    test_dataset = SentenceDataset(args.test_file,
                                   vocab,
                                   max_token_cnt=args.max_token_cnt)

    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=1,
                                              num_workers=args.num_workers,
                                              shuffle=False)

    model = ExtractiveModel(bert_model,
                            100,
                            11,
                            768,
                            use_bert_sum_words=args.use_bert_sum_words,
                            use_pos=args.use_pos,
                            use_media=args.use_media,
                            num_classes=2,
                            simple_model=args.simple_model,
                            dim_feedforward=args.dim_feedforward,
                            dropout=args.dropout)

    if args.checkpoint_path is not None and os.path.isfile(
            args.checkpoint_path):
        state_dict = torch.load(args.checkpoint_path)[0]
        model.load_state_dict(state_dict)

    model.eval()  # Set model to evaluate mode
    device = 'cuda'
    model.to(device)

    ids = []
    summaries = []
    for step, (token_ids_batch, pos_idx_batch,
               media_batch) in enumerate(test_loader):
        if step % 10 == 0:
            print(step, len(test_loader))
        token_ids_batch = token_ids_batch[0].to(device)
        pos_idx_batch = pos_idx_batch[0].to(device)
        media_batch = media_batch[0].to(device)
        sentences, _, id = test_dataset.samples[step]
        ids.append(id)
        sentences = np.array(sentences)
        with torch.set_grad_enabled(False):
            outputs = model(token_ids_batch, pos_idx_batch, media_batch)
            indices = torch.argsort(outputs[:, 0], dim=0)
            sentences = sentences[indices[:3].cpu().numpy()]
            summaries.append("\n".join(sentences))

    os.makedirs(args.output_dir, exist_ok=True)
    rows = zip(ids, summaries)
    with open(os.path.join(args.output_dir, "submission.csv"), "w+") as f:
        writer = csv.writer(f)
        writer.writerow(["id", "summary"])
        for row in rows:
            writer.writerow(row)
    print("done")
Example #7
def get_sentimentLabel(input_text, time_info):
    try:
        print("2. predict sentiment label")
        device = torch.device("cpu")
        bertmodel, vocab = get_pytorch_kobert_model()
        tokenizer = get_tokenizer()
        tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

        model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)
        weights = torch.load('weight/bert_weight.pth',
                             map_location=torch.device('cpu'))
        model.load_state_dict(weights)
        model = model.to(device)
        model.eval()

        essay = pd.DataFrame(input_text)
        essay['label'] = 1
        save_link = "Data/{}.txt".format(time_info)
        essay.to_csv(save_link, sep='\t', index_label='idx')
        dataset_sentences = nlp.data.TSVDataset(save_link,
                                                field_indices=[1, 2],
                                                num_discard_samples=1)
        data_sentences = BERTDataset(dataset_sentences, 0, 1, tok, 100, True,
                                     False)  # max_len (100)
        sentences_dataloader = torch.utils.data.DataLoader(
            data_sentences, batch_size=len(data_sentences), num_workers=5)

        with torch.no_grad():
            for batch_id, (token_ids, valid_length, segment_ids,
                           label) in enumerate(sentences_dataloader):
                token_ids = token_ids.long().to(device)
                segment_ids = segment_ids.long().to(device)
                label = label.long().to(device)
                valid_length = valid_length
                outputs = model(token_ids, valid_length, segment_ids)
        pred_test = outputs
        arr = np.array(pred_test.tolist())
        arr = ne.evaluate("exp(arr)")

        label_dic = dict([(0, 'anger'), (1, 'fear'), (2, 'happiness'),
                          (3, 'miss'), (4, 'sadness'), (5, 'surprised'),
                          (6, 'worry')])
        for i in range(7):
            essay[label_dic[i]] = [proba[i] for proba in arr]
        essay['label'] = list(map(np.argmax, arr))
        indices = np.array(list(map(
            np.max, arr))).argsort()[::-1][0:min(len(essay), 10)]
        prob = essay.iloc[indices].sum(axis=0)[2:].astype(float)
        prob['happiness'] *= 0.6
        prob['fear'] *= 0.8
        prob['worry'] *= 2
        result = prob.idxmax()
        if result == 'fear':
            result = 'sadness'
        return result
    except Exception as e:
        raise Sentiment_Error() from e
Example #8
    def __init__(self, temp_dir, load_pretrained_bert, bert_config):
        super(Bert, self).__init__()
        bertmodel, vocab = get_pytorch_kobert_model()
        if (load_pretrained_bert):
            # self.model = BertModel.from_pretrained('bert-base-uncased', cache_dir=temp_dir)
            self.model = bertmodel

        else:
            self.model = bertmodel
Example #9
    def load_model(self):
        self.bert_model, self.vocab = get_pytorch_kobert_model(ctx=self.device)
        self.model = BERTClassifier(self.bert_model, dr_rate=self.dropout_rt).to(self.device)

        self.model.load_state_dict(torch.load(self.save_path, map_location=self.device))

        self.tokenizer = get_tokenizer()
        self.token = gluonnlp.data.BERTSPTokenizer(self.tokenizer, self.vocab, lower=False)

        self.line_converter = Converter(self.token, self.max_len, self.pad, self.pair, self.device)
Example #10
    def __init__(self, config, num_classes, vocab=None) -> None:
        super(KobertCRF, self).__init__()

        if vocab is None:
            self.bert, self.vocab = get_pytorch_kobert_model()
        else:
            self.bert = BertModel(config=BertConfig.from_dict(bert_config))
            self.vocab = vocab

        self.dropout = nn.Dropout(config.dropout)
        self.position_wise_ff = nn.Linear(config.hidden_size, num_classes)
        self.crf = CRF(num_tags=num_classes, batch_first=True)
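Only the constructor of KobertCRF is shown in this example. As a hedged illustration of how the dropout, linear projection, and CRF layers above are typically wired together (and consistent with how the training loop in Example #27 calls the model positionally and unpacks (log_likelihood, sequence_of_tags)), a forward() might look like this; it is a sketch, not the repository's actual method.

    def forward(self, input_ids, token_type_ids=None, tags=None):
        # sketch only: BERT encoding -> dropout -> per-token emissions -> CRF
        outputs = self.bert(input_ids=input_ids, token_type_ids=token_type_ids)
        sequence_output = outputs[0]                      # (batch, seq_len, hidden)
        emissions = self.position_wise_ff(self.dropout(sequence_output))
        if tags is not None:                              # training: CRF log-likelihood
            log_likelihood = self.crf(emissions, tags)
            return log_likelihood, self.crf.decode(emissions)
        return self.crf.decode(emissions)                 # inference: best tag sequences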
Example #11
    def load_model(self):
        self.bert_model, self.vocab = get_pytorch_kobert_model(ctx=self.device)
        self.model = BERTClassifier(self.bert_model,
                                    dr_rate=self.dropout_rt).to(self.device)
        if self.get_weights:
            print("get model from pretrained weigths")
            self.model.load_state_dict(
                torch.load(self.model_save_path, map_location=self.device))

        self.tokenizer = get_tokenizer()
        self.token = gluonnlp.data.BERTSPTokenizer(self.tokenizer,
                                                   self.vocab,
                                                   lower=False)
Example #12
    def __init__(self, large, temp_dir, finetune=False):
        super(Bert, self).__init__()
        if (large):
            self.model = BertModel.from_pretrained('bert-large-uncased',
                                                   cache_dir=temp_dir)
        else:
            #self.model = BertModel.from_pretrained('bert-base-uncased', cache_dir=temp_dir)
            vocab = get_kobert_vocab(temp_dir)
            self.model, _ = get_pytorch_kobert_model(cachedir=temp_dir)
            # add [BOS], [EOS]
            self.model.resize_token_embeddings(len(vocab))

        self.finetune = finetune
Example #13
    def __init__(self, train_path, test_path, kaggle_path, use_all):
        device = torch.device("cuda:0")

        bertmodel, vocab = get_pytorch_kobert_model()
        self.model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)

        dataset_train = nlp.data.TSVDataset(train_path,
                                            field_indices=[1, 2],
                                            num_discard_samples=1)
        dataset_test = nlp.data.TSVDataset(test_path,
                                           field_indices=[1, 2],
                                           num_discard_samples=1)
        dataset_kaggle = nlp.data.TSVDataset(kaggle_path,
                                             field_indices=[1],
                                             num_discard_samples=1,
                                             encoding='cp949')

        tokenizer = get_tokenizer()
        tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

        data_train = BERTDataset(dataset_train, 0, 1, tok, config.max_len,
                                 True, False)
        data_test = BERTDataset(dataset_test, 0, 1, tok, config.max_len, True,
                                False)
        data_kaggle = BERTDataset(dataset_kaggle,
                                  0,
                                  1,
                                  tok,
                                  config.max_len,
                                  True,
                                  False,
                                  kaggle=True)

        self.train_dataloader = torch.utils.data.DataLoader(
            data_train, batch_size=config.batch_size, num_workers=5)
        self.test_dataloader = torch.utils.data.DataLoader(
            data_test, batch_size=config.batch_size, num_workers=5)
        self.kaggle_dataloader = torch.utils.data.DataLoader(data_kaggle,
                                                             batch_size=1,
                                                             num_workers=5)
        if use_all:
            dataset_all = nlp.data.TSVDataset(config.all_path,
                                              field_indices=[1, 2],
                                              num_discard_samples=1)
            data_all = BERTDataset(dataset_all, 0, 1, tok, config.max_len,
                                   True, False)
            self.all_dataloader = torch.utils.data.DataLoader(
                data_all,
                batch_size=config.batch_size,
                num_workers=5,
                shuffle=True)
Example #14
    def __init__(self,
                 hidden_size=768,
                 num_classes=4,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.dr_rate = dr_rate
        self.device = torch.device(
            "cuda:0") if torch.cuda.is_available() else torch.device("cpu")
        self.bertmodel, _ = get_pytorch_kobert_model()

        self.classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
Example #15
 def __init__(self,
              bert,
              hidden_size=768,
              num_classes=2,
              dr_rate=None,
              params=None):
     super(RNNClassifier, self).__init__()
     _, vocab = get_pytorch_kobert_model()
     self.dr_rate = dr_rate
     self.embedding = nn.Embedding(len(vocab.token_to_idx), 100)
     self.rnn = nn.RNN(100, hidden_size, batch_first=True)
     self.classifier = nn.Linear(hidden_size, num_classes)
     if dr_rate:
         self.dropout = nn.Dropout(p=dr_rate)
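Example #15 likewise shows only the constructor. A hedged sketch of a matching forward() for this RNNClassifier (embedding, RNN, last hidden state, linear classifier) could be the following; the real method is not part of this excerpt.

 def forward(self, token_ids):
     # sketch only
     embedded = self.embedding(token_ids)       # (batch, seq_len, 100)
     _, hidden = self.rnn(embedded)             # hidden: (1, batch, hidden_size)
     hidden = hidden.squeeze(0)
     if self.dr_rate:
         hidden = self.dropout(hidden)
     return self.classifier(hidden)             # (batch, num_classes)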
Example #16
 def __init__(self, tokenizer_s='spacy'):
     """
     bert-multi, kbalbert : [PAD], [CLS], ...
     :param tokenizer: string to represent tokenizer like 'spacy', 'bert', ...
     Example::
     
     nlp = English()
     tokenizer = nlp.Defaults.create_tokenizer(nlp)      
     tokenizer = Tokenizer(tokenizer)
     """
     if type(tokenizer_s) is str:
         self.tokenizer_s = tokenizer_s
     if tokenizer_s == 'spacy':
         self.nlp = spacy.load(
             "en_core_web_md")  # md, large have embed vectors
         self.tokenizer = self.nlp.Defaults.create_tokenizer(self.nlp)
     elif tokenizer_s == 'bert-multi':
         from transformers import BertTokenizer, BertModel, BertConfig
         self.tokenizer = BertTokenizer.from_pretrained(
             'bert-base-multilingual-cased')
         self.vocab = self.tokenizer.vocab
     elif tokenizer_s == 'sktkobert':
         import gluonnlp as nlp
         from kobert.utils import get_tokenizer
         from kobert.pytorch_kobert import get_pytorch_kobert_model
         kobert, vocab = get_pytorch_kobert_model()
         self.tokenizer = nlp.data.BERTSPTokenizer(get_tokenizer(),
                                                   vocab,
                                                   lower=False)
         self.vocab = vocab
     elif tokenizer_s == 'kbalbert':
         import sys
         sys.path.append(
             '/home/bwlee/work/codes/KB-ALBERT-KO/kb-albert-char/')
         from transformers import AlbertModel, TFAlbertModel
         from tokenization_kbalbert import KbAlbertCharTokenizer
         model_path = '/home/bwlee/work/codes/KB-ALBERT-KO/kb-albert-char/model'
         self.tokenizer = KbAlbertCharTokenizer.from_pretrained(model_path)
         self.vocab = self.tokenizer.vocab
     else:
         if type(tokenizer_s) is str:
             from transformers import BertTokenizer, BertModel, BertConfig
             self.tokenizer = BertTokenizer.from_pretrained(tokenizer_s)
             self.vocab = self.tokenizer.vocab
         elif type(tokenizer_s) is not str:
             self.tokenizer = tokenizer_s
             self.tokenizer_s = 'custom'
         else:
             raise Exception('check tokenizer is correctly defined')
     self.pre_trained = self.tokenizer_s
Example #17
def bert_test(opt):

    device = torch.device('cuda:{}'.format(opt.device))
    model = torch.load(opt.weights)
    model.to(device)
    # model = nn.DataParallel(model, output_device=[0,1])
    bertmodel, vocab = get_pytorch_kobert_model()
    model.eval()  # switch to evaluation mode

    def calc_accuracy(X, Y):
        max_vals, max_indices = torch.max(X, 1)
        train_acc = (max_indices
                     == Y).sum().data.cpu().numpy() / max_indices.size()[0]
        return train_acc

    tokenizer = get_tokenizer()
    tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
    max_len = 256  # BERT does not see tokens beyond this length
    batch_size = opt.batch
    warmup_ratio = 0.1
    num_epochs = 2
    max_grad_norm = 1
    log_interval = 200
    learning_rate = 5e-5
    dataset_test = nlp.data.TSVDataset(opt.source,
                                       field_indices=[1, 2],
                                       num_discard_samples=1)
    data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(data_test,
                                                  batch_size=batch_size,
                                                  num_workers=5)
    test_acc = 0.0
    df = pd.DataFrame(columns=['pred', 'label'])
    pred = np.array([])
    # answer = np.array([])
    for batch_id, (token_ids, valid_length, segment_ids,
                   label) in enumerate(tqdm_notebook(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length = valid_length
        # label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        _, max_idx = torch.max(out, 1)
        pred = np.append(pred, max_idx.cpu().detach().tolist())
        # answer = np.append(answer,label.cpu().detach().tolist())
        # test_acc += calc_accuracy(out, label)
        # print(len(pred))
    df['pred'] = pred
    # df['label'] = answer
    df.to_csv(opt.save_csv_name, index=False)
Example #18
    def __init__(self, vocab=None, tokenizer=None, maxlen=30, model_dir=Path('data_in')):

        if vocab is None or tokenizer is None:
            tok_path = get_tokenizer()
            self.ptr_tokenizer = SentencepieceTokenizer(tok_path)
            self.ptr_detokenizer = SentencepieceDetokenizer(tok_path)
            _, vocab_of_gluonnlp = get_pytorch_kobert_model()
            token2idx = vocab_of_gluonnlp.token_to_idx
            self.vocab = Vocabulary(token2idx=token2idx)
            self.tokenizer = Tokenizer(vocab=self.vocab, split_fn=self.ptr_tokenizer, pad_fn=keras_pad_fn, maxlen=maxlen)
        else:
            self.vocab = vocab
            self.tokenizer = tokenizer
        self.maxlen = maxlen
        self.model_dir = model_dir
Example #19
    def __init__(self, config, num_classes, vocab=None) -> None:
        super(KobertBiLSTMCRF, self).__init__()

        if vocab is None:  # use the pretrained KoBERT model
            self.bert, self.vocab = get_pytorch_kobert_model()
        else:  # use a fine-tuned model's config and vocab
            self.bert = BertModel(config=BertConfig.from_dict(bert_config))
            self.vocab = vocab
        self._pad_id = self.vocab.token_to_idx[self.vocab.padding_token]

        self.dropout = nn.Dropout(config.dropout)
        self.bilstm = nn.LSTM(config.hidden_size, (config.hidden_size) // 2,
                              dropout=config.dropout,
                              batch_first=True,
                              bidirectional=True)
        self.position_wise_ff = nn.Linear(config.hidden_size, num_classes)
        self.crf = CRF(num_tags=num_classes, batch_first=True)
Example #20
def BERT_inference(text):

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    _, vocab = get_pytorch_kobert_model(device)

    tokenizer = get_tokenizer()
    tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

    max_len = 80
    batch_size = 64
    warmup_ratio = 0.1
    num_epochs = 10
    max_grad_norm = 1
    log_interval = 30
    learning_rate = 5e-5

    # Loading option 1: load only the trained parameters (state_dict)
    #new_save_path = 'v3_model_only_parameter_0302.pt'
    #model = BERTClassifier(bertmodel, dr_rate=0.1)
    #model.load_state_dict(torch.load(new_save_path))
    #model.eval()

    # Loading option 2: load the entire saved model object
    save_path = 'v2_model_0302.pt'
    model = torch.load(save_path)
    model.eval()

    infer_data = BERTDataset_infer(text, 0, tok, max_len, True, False)
    infer_data = torch.tensor(next(iter(infer_data))[0]).reshape(1, -1)

    segments_tensors = torch.zeros(len(infer_data[0]))
    segments_tensors = segments_tensors.reshape(1, -1)

    valid_length = torch.tensor(len(infer_data[0]))
    valid_length = valid_length.reshape(1, -1)

    infer_data = infer_data.long().to(device)
    segments_tensors = segments_tensors.long().to(device)
    valid_length = valid_length.long().to(device)

    with torch.no_grad():
        outputs = model(infer_data, valid_length, segments_tensors)

    print("딥러닝 최종 inference : ", torch.argmax(outputs[0]))

    return torch.argmax(outputs[0])
Example #21
    def __init__(self, vectorizer=None, tokenizer=None, dim_embed=200):
        """
        :param tokenizer: KB 
        """
        self.vectorizer = vectorizer
        self.tokenizer = tokenizer
        self.pre_trained = pre_trained = vectorizer.pre_trained
        self.n_tag = self.vectorizer.n_tag

        if 'bert' in pre_trained.lower():
            self.tag2vec = None
            import sys
            if pre_trained == 'bert-multi':
                from transformers import BertModel, BertConfig
                bert_config = BertConfig.from_pretrained(
                    'bert-base-multilingual-cased', output_hidden_states=True)
                self.bert = BertModel(bert_config).to(device)
            elif pre_trained == 'sktkobert':
                from kobert.pytorch_kobert import get_pytorch_kobert_model
                #sys.path.append('/home/bwlee/work/codes/sentence_similarity/kobert')
                #from pytorch_kobert3 import get_pytorch_kobert_model
                self.bert, _ = get_pytorch_kobert_model()
                self.bert = self.bert.to(device)
            elif pre_trained == 'kbalbert':
                sys.path.append(
                    '/home/bwlee/work/codes/KB-ALBERT-KO/kb-albert-char/')
                from transformers import AlbertModel
                kbalbert_path = '/home/bwlee/work/codes/KB-ALBERT-KO/kb-albert-char/model'
                self.bert = AlbertModel.from_pretrained(
                    kbalbert_path, output_hidden_states=True)
                self.bert = self.bert.to(device)
            else:
                from transformers import BertModel, BertConfig
                bert_config = BertConfig.from_pretrained(
                    pre_trained, output_hidden_states=True)
                self.bert = BertModel(bert_config).to(device)
        else:
            self.tag2vec = self.vectorizer.tag2vec
            self.n_vocab = len(self.vectorizer.tag2vec)
            if pre_trained == '':
                self.embed = nn.Embedding(num_embeddings=self.n_tag,
                                          embedding_dim=dim_embed,
                                          padding_idx=self.tag2ix[PAD_TAG])
Example #22
def make_model(N=6, d_model=768, d_ff=1024, h=8, dropout=0.1):
    # To copy Bert embedding layer, d_model should be the same for Generator.
    # Since d_model=768 in decoder is too big to train, d_ff is set from 3072 to 1024. (zzingae)
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)

    bert, vocab = get_pytorch_kobert_model()
    vocab_size = len(vocab)

    model = Chatbot(
        bert,
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, vocab_size), c(position)),
        nn.Sequential(Embeddings(d_model, vocab_size), c(position)),
        Generator(d_model, vocab_size))

    return model, vocab
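A minimal usage sketch for make_model(); the parameter count printed below is purely illustrative.

if __name__ == "__main__":
    model, vocab = make_model(N=6, d_model=768, d_ff=1024, h=8, dropout=0.1)
    n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"trainable parameters: {n_params:,} | vocab size: {len(vocab)}")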
Example #23
def get_loader(raw_data,
               max_len,
               batch_size=100,
               shuffle=False,
               user_map_dict=None,
               max_users=10):
    def collate_fn(data):
        return zip(*data)

    bertmodel, vocab = get_pytorch_kobert_model()
    tokenizer = kobert.utils.get_tokenizer()
    tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

    dataset = CDMMBDataset(raw_data[0], raw_data[1], raw_data[2], tok, max_len,
                           True, False, user_map_dict, max_users)

    data_loader = DataLoader(dataset=dataset,
                             batch_size=batch_size,
                             shuffle=shuffle,
                             collate_fn=collate_fn)

    return data_loader
Example #24
    def __init__(self, path: str, max_seqlen: int = 512, ignore_index=-100) -> None:
        super(TokenizedDataset, self).__init__()
        with open(path, "rb") as f:
            self.data = pickle.load(f)
        self.max_len = max_seqlen

        _, self.vocab = get_pytorch_kobert_model()
        tok = get_tokenizer()
        self.tokenizer = nlp.data.BERTSPTokenizer(tok, self.vocab, lower=False)

        if "train" in path:
            self.data["token"] = self.data["token"][:100000]
            self.data["tgt"] = self.data["tgt"][:100000]

        self.tokens = self.data["token"]
        self.labels = self.data["tgt"]

        self.cls_idx = self.vocab["[CLS]"]
        self.pad_idx = self.vocab["[PAD]"]
        self.sep_idx = self.vocab["[SEP]"]
        self.mask_idx = self.vocab["[MASK]"]
        self.ignore_idx = ignore_index
Example #25
    def __init__(
        self,
        train_path: str = None,
        val_path: str = None,
        test_path: str = None,
        lr: float = None,
        warmup_percent: float = 0.1,
        train_batch_size: int = None,
        val_batch_size: int = None,
        num_classes: int = 2,
        num_workers: int = 2,
        gpus: int = 2,
        config: dict = bert_config,
    ) -> None:
        super(ContentSelector, self).__init__()
        self.save_hyperparameters()
        self.lr = self.hparams.lr
        self.lr_scale = 0
        self.loss = nn.CrossEntropyLoss()

        self.bert, self.vocab = get_pytorch_kobert_model()
        self.dropout = nn.Dropout(self.hparams.config["hidden_dropout_prob"])
        self.classifier = nn.Linear(self.hparams.config["hidden_size"],
                                    num_classes)
Example #26
def main(args):
    root = args.path
    mode = args.mode
    dset = load_data(root, mode)

    _, vocab = get_pytorch_kobert_model()
    tokenizer = get_tokenizer()
    tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

    tokenized_dset = []
    start_time = time.time()
    for d in tqdm(dset):
        tokenized_dset.append(tokenize(d, tok))
    print("--- %s seconds for tokenizing ---" % (time.time() - start_time))

    start_time = time.time()
    result = {"token": [], "tgt": []}
    for idx, data in tqdm(enumerate(tokenized_dset)):
        src = " ".join([" ".join(d) for d in data["tokenized_src"]]).split(" ")
        tgt = " ".join([" ".join(d) for d in data["tokenized_abs"]]).split(" ")

        auxiliary_tgt = make_aux_tgt(src, tgt)

        assert len(
            src) == len(auxiliary_tgt
                        ), f"Length mismatch: {len(src)}, {len(auxiliary_tgt)}"

        result["token"].append(src)
        result["tgt"].append(auxiliary_tgt)

    print("--- %s seconds for generating labels ---" %
          (time.time() - start_time))

    with open(f"{args.save_path}/contentselection_{mode}.pickle", "wb") as f:
        pickle.dump(result, f)
    print("--- Finished ---")
Example #27
def main(parser):
    # Config
    args = parser.parse_args()
    data_dir = Path(args.data_dir)
    model_dir = Path(args.model_dir)

    # data_config = Config(json_path=data_dir / 'config.json')
    model_config = Config(json_path=model_dir / 'config.json')

    # Vocab & Tokenizer
    tok_path = get_tokenizer() # ./tokenizer_78b3253a26.model
    ptr_tokenizer = SentencepieceTokenizer(tok_path)

    _, vocab_of_gluonnlp = get_pytorch_kobert_model()
    token_to_idx = vocab_of_gluonnlp.token_to_idx

    model_config.vocab_size = len(token_to_idx)
    vocab = Vocabulary(token_to_idx=token_to_idx)

    print("len(token_to_idx): ", len(token_to_idx))
    with open(model_dir / "token2idx_vocab.json", 'w', encoding='utf-8') as f:
        json.dump(token_to_idx, f, ensure_ascii=False, indent=4)

    # save vocab & tokenizer
    with open(model_dir / "vocab.pkl", 'wb') as f:
        pickle.dump(vocab, f)

    # load vocab & tokenizer
    with open(model_dir / "vocab.pkl", 'rb') as f:
        vocab = pickle.load(f)

    tokenizer = Tokenizer(vocab=vocab, split_fn=ptr_tokenizer, pad_fn=keras_pad_fn, maxlen=model_config.maxlen)
    ner_formatter = NamedEntityRecognitionFormatter(vocab=vocab, tokenizer=tokenizer, maxlen=model_config.maxlen, model_dir=model_dir)

    # Train & Val Datasets
    cwd = Path.cwd()
    data_in = cwd / "data_in"
    train_data_dir = data_in / "NER-master" / "말뭉치 - 형태소_개체명"
    tr_clf_ds = NamedEntityRecognitionDataset(train_data_dir=train_data_dir, model_dir=model_dir)
    tr_clf_ds.set_transform_fn(transform_source_fn=ner_formatter.transform_source_fn, transform_target_fn=ner_formatter.transform_target_fn)
    tr_clf_dl = DataLoader(tr_clf_ds, batch_size=model_config.batch_size, shuffle=True, num_workers=4, drop_last=False)

    # Model
    model = KobertCRF(config=model_config, num_classes=len(tr_clf_ds.ner_to_index))
    model.train()

    # optim
    train_examples_len = len(tr_clf_ds)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]

    # num_train_optimization_steps = int(train_examples_len / model_config.batch_size / model_config.gradient_accumulation_steps) * model_config.epochs
    t_total = len(tr_clf_dl) // model_config.gradient_accumulation_steps * model_config.epochs
    optimizer = AdamW(optimizer_grouped_parameters, lr=model_config.learning_rate, eps=model_config.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=model_config.warmup_steps, t_total=t_total)

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    n_gpu = torch.cuda.device_count()
    # if n_gpu > 1:
    #     model = torch.nn.DataParallel(model)
    model.to(device)

    # save
    tb_writer = SummaryWriter('{}/runs'.format(model_dir))
    checkpoint_manager = CheckpointManager(model_dir)
    summary_manager = SummaryManager(model_dir)
    best_val_loss = 1e+10
    best_train_acc = 0

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(tr_clf_ds))
    logger.info("  Num Epochs = %d", model_config.epochs)
    logger.info("  Instantaneous batch size per GPU = %d", model_config.batch_size)
    # logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
    #                args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", model_config.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    best_dev_acc, best_dev_loss = 0.0, 99999999999.0
    best_steps = 0
    model.zero_grad()
    set_seed()  # Added here for reproducibility (even between python 2 and 3)

    # Train
    train_iterator = trange(int(model_config.epochs), desc="Epoch")
    for _epoch, _ in enumerate(train_iterator):
        epoch_iterator = tqdm(tr_clf_dl, desc="Iteration") # , disable=args.local_rank not in [-1, 0]
        epoch = _epoch
        for step, batch in enumerate(epoch_iterator):
            model.train()
            x_input, token_type_ids, y_real = map(lambda elm: elm.to(device), batch)
            log_likelihood, sequence_of_tags = model(x_input, token_type_ids, y_real)

            # loss: negative log-likelihood
            loss = -1 * log_likelihood

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if model_config.gradient_accumulation_steps > 1:
                loss = loss / model_config.gradient_accumulation_steps

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), model_config.max_grad_norm)
            tr_loss += loss.item()

            if (step + 1) % model_config.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                with torch.no_grad():
                    sequence_of_tags = torch.tensor(sequence_of_tags)
                    print("sequence_of_tags: ", sequence_of_tags)
                    print("y_real: ", y_real)
                    print("loss: ", loss)
                    print("(sequence_of_tags == y_real): ", (sequence_of_tags == y_real))

                    mb_acc = (sequence_of_tags == y_real).float()[y_real != vocab.PAD_ID].mean()

                tr_acc = mb_acc.item()
                tr_loss_avg = tr_loss / global_step
                tr_summary = {'loss': tr_loss_avg, 'acc': tr_acc}

                # if step % 50 == 0:
                print('epoch : {}, global_step : {}, tr_loss: {:.3f}, tr_acc: {:.2%}'.format(epoch + 1, global_step,
                                                                                             tr_summary['loss'],
                                                                                             tr_summary['acc']))

                if model_config.logging_steps > 0 and global_step % model_config.logging_steps == 0:
                    # Log metrics
                    if model_config.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        pass
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) / model_config.logging_steps, global_step)
                    logger.info("Average loss: %s at global step: %s",
                                str((tr_loss - logging_loss) / model_config.logging_steps), str(global_step))
                    logging_loss = tr_loss

                if model_config.save_steps > 0 and global_step % model_config.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(model_config.output_dir, 'epoch-{}'.format(epoch + 1))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    logger.info("Saving model checkpoint to %s", output_dir)

                    state = {'global_step': global_step + 1,
                             'model_state_dict': model.state_dict(),
                             'opt_state_dict': optimizer.state_dict()}
                    summary = {'train': tr_summary}
                    summary_manager.update(summary)
                    summary_manager.save('summary.json')

                    is_best = tr_acc >= best_train_acc  # selects on train accuracy (ideally this should use validation accuracy)
                    # Save
                    if is_best:
                        best_train_acc = tr_acc
                        checkpoint_manager.save_checkpoint(state,
                                                           'best-epoch-{}-step-{}-acc-{:.3f}.bin'.format(epoch + 1,
                                                                                                         global_step,
                                                                                                         tr_acc))
                    else:
                        torch.save(state, os.path.join(output_dir,
                                                       'model-epoch-{}-step-{}-acc-{:.3f}.bin'.format(epoch + 1,
                                                                                                      global_step,
                                                                                                      tr_acc)))

    tb_writer.close()
    logger.info(" global_step = %s, average loss = %s", global_step, tr_loss / global_step)

    return global_step, tr_loss / global_step, best_steps
Example #28
import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook

from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

## when using a GPU
device = torch.device("cuda:0")

bertmodel, vocab = get_pytorch_kobert_model()

from google.colab import drive
drive.mount('/content/drive')

# load the training dataset
import pandas as pd
dataset_train1 = pd.read_csv('/content/drive/My Drive/Colab Notebooks/자연어처리/user_conversation.csv')
dataset_train1.head()

# data preprocessing
dataset_train1.drop(['Unnamed: 0', 'Unnamed: 0.1', '질문 제목', '작성 시간', '태그', 'url'], axis=1, inplace=True)
dataset_train1.head()

dataset_train1['질병명'].unique()
Example #29
def main(argv):
    if FLAGS.model == 'BERT':
        tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        model = BertClassificationModel(input_path=FLAGS.input_path,
                                        model='bert-base-multilingual-cased',
                                        tokenizer=tokenizer,
                                        batch_size=FLAGS.batch_size,
                                        num_workers=FLAGS.num_workers,
                                        lr=FLAGS.lr,
                                        weight_decay=FLAGS.weight_decay,
                                        warm_up=FLAGS.warm_up)
    elif FLAGS.model == 'KoBERT':
        bertmodel, vocab = get_pytorch_kobert_model()
        tokenizer = nlp.data.BERTSPTokenizer(get_tokenizer(), vocab, lower=False)
        model = KoBertClassficationModel(input_path=FLAGS.input_path,
                                         model=bertmodel,
                                         tokenizer=tokenizer,
                                         batch_size=FLAGS.batch_size,
                                         num_workers=FLAGS.num_workers,
                                         lr=FLAGS.lr,
                                         weight_decay=FLAGS.weight_decay,
                                         warm_up=FLAGS.warm_up)
    elif FLAGS.model == 'KcBERT':
        tokenizer = BertTokenizer.from_pretrained('beomi/kcbert-large')
        model = BertClassificationModel(input_path=FLAGS.input_path,
                                        model='beomi/kcbert-large',
                                        tokenizer=tokenizer,
                                        batch_size=FLAGS.batch_size,
                                        num_workers=FLAGS.num_workers,
                                        lr=FLAGS.lr,
                                        weight_decay=FLAGS.weight_decay,
                                        warm_up=FLAGS.warm_up)
    else:
        raise ValueError('Unknown model type')

    seed_everything(42)

    checkpoint_callback = ModelCheckpoint(
        filepath=FLAGS.save_dir,
        save_top_k=1,
        monitor='val_loss',
        mode='min'
    )

    early_stop = EarlyStopping(
        monitor='val_loss',
        patience=2,
        strict=False,
        verbose=False,
        mode='min'
    )

    logger = TensorBoardLogger(
        save_dir=FLAGS.save_dir,
        name='logs_' + FLAGS.model,
        version=FLAGS.version
    )
    lr_monitor = LearningRateMonitor(logging_interval='step')

    if FLAGS.config_path is not None:
        parser = ConfigParser()
        parser.read(FLAGS.config_path)

        # define the Telegram-notified trainer only when a config file (and thus
        # a token/chat_id) is available; otherwise `parser` would be undefined here
        @telegram_sender(token=parser.get('telegram', 'token'),
                         chat_id=parser.get('telegram', 'chat_id'))
        def train_notify(trainer: Trainer = None,
                         model: Union[BertClassificationModel, KoBertClassficationModel] = None) -> None:
            trainer.fit(model)

    if FLAGS.cuda_device > 1:
        trainer = Trainer(deterministic=True,
                          gpus=FLAGS.cuda_device,
                          distributed_backend='ddp',
                          log_gpu_memory=True,
                          checkpoint_callback=checkpoint_callback,
                          early_stop_callback=early_stop,
                          max_epochs=FLAGS.max_epochs,
                          logger=logger,
                          callbacks=[lr_monitor])
        logging.info(f'There are {torch.cuda.device_count()} GPU(s) available.')
        logging.info(f'Use the number of GPU: {FLAGS.cuda_device}')
    elif FLAGS.cuda_device == 1:
        trainer = Trainer(deterministic=True,
                          gpus=FLAGS.cuda_device,
                          log_gpu_memory=True,
                          checkpoint_callback=checkpoint_callback,
                          early_stop_callback=early_stop,
                          max_epochs=FLAGS.max_epochs,
                          logger=logger,
                          callbacks=[lr_monitor])
        logging.info(f'There are {torch.cuda.device_count()} GPU(s) available.')
        logging.info(f'Use the number of GPU: {FLAGS.cuda_device}')
    else:
        trainer = Trainer(deterministic=True,
                          checkpoint_callback=checkpoint_callback,
                          early_stop_callback=early_stop,
                          max_epochs=FLAGS.max_epochs,
                          logger=logger,
                          callbacks=[lr_monitor])
        logging.info('No GPU available, using the CPU instead.')
    if FLAGS.config_path is not None:
        train_notify(trainer=trainer,
                     model=model)
    else:
        trainer.fit(model)
Example #30
def run_model():
    device = torch.device("cuda:0")
    bertmodel, vocab = get_pytorch_kobert_model()
    tokenizer = get_tokenizer()
    tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
    
    max_len = 64
    batch_size = 64
    warmup_ratio = 0.1
    num_epochs = 2
    max_grad_norm = 1
    log_interval = 200
    learning_rate =  5e-5
    
    dataset_train, dataset_test = train_test_split(dtls, test_size=0.2, random_state=123)
    
    data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
    data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)

    train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=0)
    test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=0)
    

    model = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)
    
    # set up the optimizer and loss function (optimizer_grouped_parameters was
    # undefined in this excerpt; the weight-decay grouping below mirrors Example #27)
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
    loss_fn = nn.CrossEntropyLoss()
    
    t_total = len(train_dataloader) * num_epochs
    warmup_step = int(t_total * warmup_ratio)
    
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)
    
    
    # start training
    for e in range(num_epochs):
        train_acc = 0.0
        test_acc = 0.0
        model.train()
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(train_dataloader)):
            optimizer.zero_grad()
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device) 
            valid_length= valid_length
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            loss = loss_fn(out, label)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            train_acc += calc_accuracy(out, label)
            if batch_id % log_interval == 0:
                print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1))) 
        print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1))) 
        model.eval()  # evaluation phase
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            valid_length= valid_length
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
        print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))
        
        
    return model
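run_model() calls a calc_accuracy() helper that is not defined in this excerpt; a minimal version, matching the one defined inside Example #17's bert_test(), would be:

def calc_accuracy(X, Y):
    _, max_indices = torch.max(X, 1)
    return (max_indices == Y).sum().data.cpu().numpy() / max_indices.size()[0]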