def __init__(self, test_file, vocab, max_token_cnt=300):
    """Load a jsonlines test file into [sentences, media, id] samples.

    :param test_file: path to a jsonlines file with 'media', 'id' and
        'article_original' fields per line
    :param vocab: token vocabulary used downstream
    :param max_token_cnt: cap on tokens per sample (stored, applied elsewhere)
    """
    self.tokenizer = SentencepieceTokenizer(get_tokenizer())
    self.vocab = vocab
    self.max_token_cnt = max_token_cnt
    # Media outlets are mapped to consecutive class ids in this fixed order.
    media_names = ('경기일보', '광양신문', '광주매일신문', '광주일보', '국제신문',
                   '기호일보', '남도일보', '당진시대', '대구신문', '대구일보', '대전일보')
    self.media_map = {name: idx for idx, name in enumerate(media_names)}
    print("medias", self.media_map)
    collected = []
    with jsonlines.open(test_file) as reader:
        for record in reader.iter():
            # Strip embedded newlines and surrounding whitespace per sentence.
            cleaned = [s.replace('\n', '').strip() for s in record['article_original']]
            collected.append([cleaned, record['media'], record['id']])
    self.samples = collected
def predict(model, text):
    """Run `model` over `text` (tokenized with KoBERT) and return the softmax
    class probabilities of the LAST batch's logits.

    NOTE(review): preserving original behavior — per-batch argmax predictions
    are accumulated in `answer` but never returned, and only the final batch's
    softmax is printed/returned. Confirm whether callers expect all batches.
    """
    device = torch.device("cuda:0")
    max_len = 64
    batch_size = 64
    tokenizer = get_tokenizer()
    bertmodel, vocab = get_pytorch_kobert_model()
    tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
    data_test = BERTDataset(text, 0, 1, tok, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=0)
    model.eval()
    answer = []
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        max_vals, max_indices = torch.max(out, 1)
        answer.append(max_indices.cpu().clone().numpy())
        # Explicit dim: F.softmax without `dim` is deprecated and relies on an
        # implicit choice. Logits are (batch, num_classes), so normalize dim=1.
        result = F.softmax(out, dim=1)
    print(result)
    return result
def generate_subword_script(dataset_path, new_path, script_prefix, use_pretrain_kobert_tokenizer=False):
    """Tokenize every KsponSpeech transcript under `dataset_path` and write the
    encoded result to `new_path`.

    :param dataset_path: root containing KsponSpeech_01 .. KsponSpeech_05 folders
    :param new_path: output directory for encoded scripts
    :param script_prefix: prefix prepended to each output filename
    :param use_pretrain_kobert_tokenizer: if True use the KoBERT SentencePiece
        tokenizer (subword strings); otherwise a local SentencePiece model
        producing token ids.
    """
    print('create_subword_script...')
    if use_pretrain_kobert_tokenizer:
        sp = SentencepieceTokenizer(get_tokenizer())
    else:
        sp = spm.SentencePieceProcessor()
        vocab_file = "aihub_sentencepiece.model"
        sp.load(vocab_file)
    # Distinct names per nesting level: the original reused `path` for all
    # three levels, which only worked because os.listdir() was evaluated
    # eagerly — any refactor would have silently broken it.
    for folder in os.listdir(dataset_path):  # folder : {KsponSpeech_01, ..., KsponSpeech_05}
        folder_path = os.path.join(dataset_path, folder)
        for subfolder in os.listdir(folder_path):
            subfolder_path = os.path.join(folder_path, subfolder)
            for file in os.listdir(subfolder_path):
                with open(os.path.join(subfolder_path, file), "r", encoding='cp949') as f:
                    sentence = f.read()
                if use_pretrain_kobert_tokenizer:
                    encode = sp(sentence)
                else:
                    encode = sp.encode_as_ids(sentence)
                # file[12:] drops the fixed "KsponSpeech_" style filename prefix.
                with open(os.path.join(new_path, script_prefix + file[12:]), "w", encoding='cp949') as f:
                    f.write(" ".join(map(str, encode)))
def __init__(self):
    """Prepare the SentencePiece tokenizer, word-vector hyper-parameters,
    a non-Hangul filter pattern and a Mecab POS tagger."""
    # KoBERT SentencePiece tokenizer
    self.tok_path = get_tokenizer()
    self.sp = SentencepieceTokenizer(self.tok_path)
    # word-embedding hyper-parameters (vector size / context window)
    self.v_dimension = 300
    self.v_window = 8
    # matches any run of characters that is NOT Hangul jamo or syllables
    self.hangul = re.compile("[^ㄱ-ㅎㅏ-ㅣ가-힣]+")
    # morphological analyzer
    self.mecab = Mecab()
def __init__(self, name):
    """Create an empty, named token vocabulary backed by the KoBERT
    SentencePiece tokenizer."""
    self.name = name
    # token <-> index lookup tables, populated as tokens are registered
    self.token2index = {}
    self.index2token = {}
    self.n_tokens = 0
    # SentencePiece tokenizer used to split raw text into subwords
    self.sp = SentencepieceTokenizer(get_tokenizer())
def get_kobert_model_and_tokenizer():
    """Return the pretrained KoBERT encoder and a KoBertTokenizer built from
    its SentencePiece model and vocabulary."""
    sp_tokenizer = SentencepieceTokenizer(get_tokenizer())
    bert_base, vocab = get_pytorch_kobert_model()
    return bert_base, KoBertTokenizer(sp_tokenizer, vocab)
def chat(model_params, sent='0'):
    """Interactive REPL chat with a KoGPT2 model (mxnet).

    :param model_params: path to saved KoGPT2Chat parameters
    :param sent: sentiment-condition token string prepended to each prompt
    """
    tok_path = get_tokenizer()
    model, vocab = get_mxnet_kobert_model(ctx=ctx)  # NOTE(review): `ctx` is a module-level global — confirm it is set before calling
    tok = SentencepieceTokenizer(tok_path, num_best=0, alpha=0)
    kogptqa = KoGPT2Chat(model)
    kogptqa.load_parameters(model_params, ctx=ctx)
    sent_tokens = tok(sent)
    while 1:
        q = input('user > ').strip()
        if q == 'quit':
            break
        q_tok = tok(q)
        a = ''
        a_tok = []
        # Greedy autoregressive decoding: re-feed the growing answer until EOS.
        while 1:
            # Layout: <usr> question <eos> <sent> sentiment <eos> <sys> answer-so-far
            input_ids = mx.nd.array([vocab[U_TKN]] + vocab[q_tok] + vocab[EOS, SENT] + vocab[sent_tokens] + vocab[EOS, S_TKN] + vocab[a_tok]).expand_dims(axis=0)
            pred = kogptqa(input_ids.as_in_context(ctx))
            # Take the argmax token at the last position only.
            gen = vocab.to_tokens(
                mx.nd.argmax(
                    pred,
                    axis=-1).squeeze().astype('int').asnumpy().tolist())[-1]
            if gen == EOS:
                break
            # '▁' is SentencePiece's word-boundary marker; render it as a space.
            a += gen.replace('▁', ' ')
            a_tok = tok(a)
        print("Simsimi > {}".format(a.strip()))
def __init__(self, input_path_or_input_list, output_path):
    """Post-processor: load the input (path or already-opened list of lines),
    the KoBERT tokenizer, the tab-separated rule file and the numbered-list
    phrase table.

    :param input_path_or_input_list: path string, or an iterable of lines
    :param output_path: destination for processed output
    """
    # load file to process
    if isinstance(input_path_or_input_list, str):
        # if a path is given as path string
        self.file = open(input_path_or_input_list, 'rt', encoding='utf8')
    else:
        # if the input is already a list (or file-like object)
        self.file = input_path_or_input_list
    self.output_path = output_path
    # TextIOBase, BufferedIOBase and RawIOBase all subclass io.IOBase, so the
    # original any([...]) over four isinstance checks reduces to one check.
    self.is_filetype = lambda x: isinstance(x, io.IOBase)
    # tokenizer
    tok_path = get_tokenizer()
    self.tokenizer = SentencepieceTokenizer(tok_path)
    # rule set: one "pattern<TAB>replacement" entry per line
    with open(config.post_process_rule_path, 'rt', encoding='utf8') as f:
        self.rules = dict(map(lambda x: tuple(x.strip('\n').split('\t')), f))
    # dict to store (x, y, y_pred) triplet
    self.idx_map = ['x', 'y', 'y_pred']
    self.inst_dict = {}
    # numbers / hyphen: replacement phrases for leading "(1)".."(6)" markers
    self.num_2_txt = {'^\(1\)': ['우선,', '먼저,', '처음으로,'],
                      '^\(2\)': ['두 번째로,', '이어서,', '다음으로,'],
                      '^\(3\)': ['세 번째로,', '이어서,', '다음으로,'],
                      '^\(4\)': ['네 번째로,', '이어서,', '다음으로,'],
                      '^\(5\)': ['다섯 번째로,', '이어서,', '다음으로,'],
                      '^\(6\)': ['여섯 번째로,', '이어서,', '다음으로,']}
def __init__(self, data_path) -> None:
    """Load a pickled summarization dataset plus the KoBERT vocab/tokenizer
    and cache the ids of the special tokens."""
    with open(data_path, 'rb') as fin:
        payload = pickle.load(fin)
    self.src_tokens = payload["src_tokens"]
    self.tgt_tokens = payload["tgt_tokens"]
    self.src_string = payload["src_raw"]
    self.tgt_string = payload["tgt_raw"]
    self.ext_labels = payload["ext_labels"]
    self.vocab = get_kobert_vocab()
    self.tokenizer = nlp.data.BERTSPTokenizer(get_tokenizer(), self.vocab, lower=False)
    # Special-token ids used when assembling model inputs.
    for attr, token in (("pad_idx", "[PAD]"), ("cls_idx", "[CLS]"),
                        ("sep_idx", "[SEP]"), ("mask_idx", "[MASK]"),
                        ("bos_idx", "[BOS]"), ("eos_idx", "[EOS]")):
        setattr(self, attr, self.vocab[token])
def main():
    """Fine-tune KoBERT (+ a linear head) on the NSMC sentiment dataset and
    evaluate after every epoch."""
    nsmc_home_dir = 'NSMC_DIR'
    train_file = nsmc_home_dir + '/ratings_train.txt'  # 150K
    test_file = nsmc_home_dir + '/ratings_test.txt'  # 50K
    model, vocab = get_pytorch_kobert_model(
        ctx='cuda' if torch.cuda.is_available() else 'cpu')
    # Hyper-parameters
    lr = 5e-5
    batch_size = 16
    epochs = 5
    dropout_rate = 0.1
    max_grad_norm = 1.0
    # Scheduler horizon assumes the 150K-row training file above.
    num_total_steps = math.ceil(150000 / batch_size) * epochs
    num_warmup_steps = num_total_steps // 10
    log_interval = 100
    seed = 2019
    num_workers = 2
    num_classes = 2
    # Classification head input width = BERT pooler output width.
    pooler_out_dim = model.pooler.dense.out_features
    torch.manual_seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print('device', device)
    tok_path = get_tokenizer()
    sp = SentencepieceTokenizer(tok_path)
    train_loader = torch.utils.data.DataLoader(MovieDataset(
        get_data(train_file, vocab, sp)), shuffle=True, batch_size=batch_size,
        num_workers=num_workers, collate_fn=batchify, pin_memory=True)
    test_loader = torch.utils.data.DataLoader(MovieDataset(
        get_data(test_file, vocab, sp)), batch_size=batch_size, shuffle=False,
        num_workers=num_workers, collate_fn=batchify, pin_memory=True)
    linear = torch.nn.Linear(pooler_out_dim, num_classes).to(device)
    # Optimize the encoder and the head jointly.
    all_params = list(model.parameters()) + list(linear.parameters())
    optimizer = AdamW(all_params, lr=lr, correct_bias=False)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=num_warmup_steps,
        num_training_steps=num_total_steps)
    for epoch in range(epochs):
        train(train_loader, device, model, linear, all_params, optimizer,
              scheduler, dropout_rate, max_grad_norm, log_interval, epoch)
        print(datetime.now(), 'Testing...')
        test(test_loader, device, model, linear)
def get_sentimentLabel(input_text, time_info):
    """Predict a document-level sentiment label for `input_text`.

    Classifies each sentence with a trained KoBERT classifier, aggregates the
    10 most confident sentences with class reweighting, and returns one of
    'anger'/'fear'/'happiness'/'miss'/'sadness'/'surprised'/'worry'
    (with 'fear' remapped to 'sadness').

    :param input_text: iterable of sentences (fed to pd.DataFrame)
    :param time_info: token used to name the temporary TSV file
    :raises Sentiment_Error: on any failure, chained to the original exception
    """
    try:
        print("2. predict sentiment label")
        device = torch.device("cpu")
        bertmodel, vocab = get_pytorch_kobert_model()
        tokenizer = get_tokenizer()
        tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
        model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)
        weights = torch.load('weight/bert_weight.pth', map_location=torch.device('cpu'))
        model.load_state_dict(weights)
        model = model.to(device)
        model.eval()
        # Round-trip through a TSV file because TSVDataset reads from disk.
        essay = pd.DataFrame(input_text)
        essay['label'] = 1  # dummy label column required by the dataset format
        save_link = "Data/{}.txt".format(time_info)
        essay.to_csv(save_link, sep='\t', index_label='idx')
        dataset_sentences = nlp.data.TSVDataset(save_link, field_indices=[1, 2], num_discard_samples=1)
        data_sentences = BERTDataset(dataset_sentences, 0, 1, tok, 100, True, False)  # max_len (100)
        # One batch containing every sentence.
        sentences_dataloader = torch.utils.data.DataLoader(
            data_sentences, batch_size=len(data_sentences), num_workers=5)
        with torch.no_grad():
            for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(sentences_dataloader):
                token_ids = token_ids.long().to(device)
                segment_ids = segment_ids.long().to(device)
                label = label.long().to(device)
                outputs = model(token_ids, valid_length, segment_ids)
        # Exponentiate the raw outputs (np.exp replaces the original numexpr
        # ne.evaluate("exp(arr)") — identical values, one less dependency).
        arr = np.exp(np.array(outputs.tolist()))
        label_dic = dict([(0, 'anger'), (1, 'fear'), (2, 'happiness'), (3, 'miss'),
                          (4, 'sadness'), (5, 'surprised'), (6, 'worry')])
        for i in range(7):
            essay[label_dic[i]] = [proba[i] for proba in arr]
        essay['label'] = list(map(np.argmax, arr))
        # Keep the (up to) 10 most confident sentences, sum per-class scores,
        # then reweight classes before choosing the winner.
        indices = np.array(list(map(
            np.max, arr))).argsort()[::-1][0:min(len(essay), 10)]
        prob = essay.iloc[indices].sum(axis=0)[2:].astype(float)
        prob['happiness'] *= 0.6
        prob['fear'] *= 0.8
        prob['worry'] *= 2
        result = prob.idxmax()
        if result == 'fear':
            result = 'sadness'
        return result
    except Exception as exc:
        # Chain the cause: the original bare `except:` discarded the traceback
        # and even caught KeyboardInterrupt/SystemExit.
        raise Sentiment_Error() from exc
def load_model(self):
    """Restore the trained BERT classifier weights and build the
    tokenizer/converter used at inference time."""
    self.bert_model, self.vocab = get_pytorch_kobert_model(ctx=self.device)
    classifier = BERTClassifier(self.bert_model, dr_rate=self.dropout_rt)
    self.model = classifier.to(self.device)
    state = torch.load(self.save_path, map_location=self.device)
    self.model.load_state_dict(state)
    self.tokenizer = get_tokenizer()
    self.token = gluonnlp.data.BERTSPTokenizer(self.tokenizer, self.vocab, lower=False)
    self.line_converter = Converter(self.token, self.max_len, self.pad,
                                    self.pair, self.device)
def main(args):
    """Report SentencePiece token-length statistics (max, mean, median) for
    the documents in args.train_file and args.test_file.

    The original duplicated the load-and-measure logic for the two files;
    it is factored into a local helper here.
    """
    for arg in vars(args):
        print(arg, getattr(args, arg))
    tokenizer = SentencepieceTokenizer(get_tokenizer())

    def _doc_token_lengths(jsonl_path):
        # One whitespace-joined document per jsonl line, then its token count.
        docs = []
        with jsonlines.open(jsonl_path) as f:
            for line in f.iter():
                docs.append(" ".join(line['article_original']).replace('\n', '') + "\n")
        lengths = []
        for i, doc in enumerate(docs):
            if i % 100 == 0:
                print(i, len(docs))
            lengths.append(len(tokenizer(doc)))
        return lengths

    lens = _doc_token_lengths(args.train_file)
    tr_max_src = max(lens, default=0)
    test_lens = _doc_token_lengths(args.test_file)
    max_src = max(test_lens, default=0)

    print("max source length train", tr_max_src)
    print("max source length test", max_src)
    print(sum(lens) / len(lens))
    print(sum(test_lens) / len(test_lens))
    import numpy as np
    print(np.median(np.array(lens)))
    print(np.median(np.array(test_lens)))
    print("done")
def __init__(self, filename, vocab, maxlen, use_emotion):
    """Dataset backed by a CSV file, tokenized with KoBERT SentencePiece.

    :param filename: CSV file with a header row, utf-8 encoded
    :param vocab: token vocabulary used downstream
    :param maxlen: maximum sequence length (stored, applied elsewhere)
    :param use_emotion: flag consumed by other methods of this dataset
    """
    # Store the contents of the file in a pandas dataframe
    self.df = pd.read_csv(filename, header=0, encoding='utf-8')
    # self.df = pd.read_csv(filename, delimiter = '\t')
    self.sp = SentencepieceTokenizer(get_tokenizer())
    self.vocab = vocab
    self.maxlen = maxlen
    self.use_emotion = use_emotion
    # NOTE(review): result is discarded — presumably a sanity check that '!'
    # exists in the tokenizer vocab (raises ValueError otherwise); confirm intent.
    self.sp.tokens.index('!')
def test_loader(dtls, max_len, batch_size, num_workers):
    """Wrap `dtls` in a BERTDataset and return an evaluation DataLoader."""
    _, vocab = get_pytorch_kobert_model()
    sp_tok = nlp.data.BERTSPTokenizer(get_tokenizer(), vocab, lower=False)
    dataset = BERTDataset(dtls, 0, 1, sp_tok, max_len, True, False)
    return torch.utils.data.DataLoader(dataset,
                                       batch_size=batch_size,
                                       num_workers=num_workers)
def __init__(self):
    """Set up the KoBERT model, vocab, tokenizer and batching constants."""
    self.device = torch.device("cuda:0")
    # Load the pretrained model/vocab ONCE and reuse the vocab for the
    # tokenizer — the original called get_pytorch_kobert_model() twice,
    # doubling startup time and memory for the same artifacts.
    self.bertmodel, self.vocab = get_pytorch_kobert_model()
    tokenizer = get_tokenizer()
    self.tok = nlp.data.BERTSPTokenizer(tokenizer, self.vocab, lower=False)
    self.max_len = 64
    self.batch_size = 64
def load_model(self):
    """Build the BERT classifier (optionally restoring saved weights) and its
    SentencePiece tokenizer."""
    self.bert_model, self.vocab = get_pytorch_kobert_model(ctx=self.device)
    self.model = BERTClassifier(self.bert_model, dr_rate=self.dropout_rt).to(self.device)
    if self.get_weights:
        print("get model from pretrained weigths")
        checkpoint = torch.load(self.model_save_path, map_location=self.device)
        self.model.load_state_dict(checkpoint)
    self.tokenizer = get_tokenizer()
    self.token = gluonnlp.data.BERTSPTokenizer(self.tokenizer, self.vocab, lower=False)
def __init__(self, train_path, test_path, kaggle_path, use_all):
    """Build the classifier and train/test/kaggle (and optionally 'all')
    DataLoaders from TSV files.

    :param train_path: TSV with label in field 2, text in field 1
    :param test_path: same layout as train_path
    :param kaggle_path: TSV with text only (field 1), cp949-encoded
    :param use_all: also build a shuffled loader over config.all_path
    """
    device = torch.device("cuda:0")
    bertmodel, vocab = get_pytorch_kobert_model()
    self.model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)
    dataset_train = nlp.data.TSVDataset(train_path, field_indices=[1, 2], num_discard_samples=1)
    dataset_test = nlp.data.TSVDataset(test_path, field_indices=[1, 2], num_discard_samples=1)
    # Kaggle file has no label column and a different encoding.
    dataset_kaggle = nlp.data.TSVDataset(kaggle_path, field_indices=[1], num_discard_samples=1, encoding='cp949')
    tokenizer = get_tokenizer()
    tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
    data_train = BERTDataset(dataset_train, 0, 1, tok, config.max_len, True, False)
    data_test = BERTDataset(dataset_test, 0, 1, tok, config.max_len, True, False)
    data_kaggle = BERTDataset(dataset_kaggle, 0, 1, tok, config.max_len, True, False, kaggle=True)
    self.train_dataloader = torch.utils.data.DataLoader(
        data_train, batch_size=config.batch_size, num_workers=5)
    self.test_dataloader = torch.utils.data.DataLoader(
        data_test, batch_size=config.batch_size, num_workers=5)
    # batch_size=1 so predictions map one-to-one onto kaggle rows.
    self.kaggle_dataloader = torch.utils.data.DataLoader(data_kaggle, batch_size=1, num_workers=5)
    if use_all:
        dataset_all = nlp.data.TSVDataset(config.all_path, field_indices=[1, 2], num_discard_samples=1)
        data_all = BERTDataset(dataset_all, 0, 1, tok, config.max_len, True, False)
        self.all_dataloader = torch.utils.data.DataLoader(
            data_all, batch_size=config.batch_size, num_workers=5, shuffle=True)
def __init__(self, tokenizer_s='spacy'):
    """
    bert-multi, kbalbert : [PAD], [CLS], ...

    :param tokenizer_s: string naming a tokenizer ('spacy', 'bert-multi',
        'sktkobert', 'kbalbert', or any HuggingFace model name), or an
        already-constructed tokenizer object.

    Example::

        nlp = English()
        tokenizer = nlp.Defaults.create_tokenizer(nlp)
        tokenizer = Tokenizer(tokenizer)
    """
    if isinstance(tokenizer_s, str):
        self.tokenizer_s = tokenizer_s
        if tokenizer_s == 'spacy':
            self.nlp = spacy.load(
                "en_core_web_md")  # md, large have embed vectors
            self.tokenizer = self.nlp.Defaults.create_tokenizer(self.nlp)
        elif tokenizer_s == 'bert-multi':
            from transformers import BertTokenizer, BertModel, BertConfig
            self.tokenizer = BertTokenizer.from_pretrained(
                'bert-base-multilingual-cased')
            self.vocab = self.tokenizer.vocab
        elif tokenizer_s == 'sktkobert':
            import gluonnlp as nlp
            from kobert.utils import get_tokenizer
            from kobert.pytorch_kobert import get_pytorch_kobert_model
            kobert, vocab = get_pytorch_kobert_model()
            self.tokenizer = nlp.data.BERTSPTokenizer(get_tokenizer(), vocab, lower=False)
            self.vocab = vocab
        elif tokenizer_s == 'kbalbert':
            import sys
            sys.path.append(
                '/home/bwlee/work/codes/KB-ALBERT-KO/kb-albert-char/')
            from tokenization_kbalbert import KbAlbertCharTokenizer
            model_path = '/home/bwlee/work/codes/KB-ALBERT-KO/kb-albert-char/model'
            self.tokenizer = KbAlbertCharTokenizer.from_pretrained(model_path)
            self.vocab = self.tokenizer.vocab
        else:
            # Fallback: treat any other string as a HuggingFace model name.
            # In the original this branch was unreachable (it lived under the
            # non-string path), so unknown names silently left self.tokenizer
            # unset and callers crashed later with AttributeError.
            from transformers import BertTokenizer, BertModel, BertConfig
            self.tokenizer = BertTokenizer.from_pretrained(tokenizer_s)
            self.vocab = self.tokenizer.vocab
    else:
        # A pre-built tokenizer object was passed in directly.
        self.tokenizer = tokenizer_s
        self.tokenizer_s = 'custom'
    self.pre_trained = self.tokenizer_s
def bert_test(opt):
    """Run a saved classifier over a TSV test set and write predictions to CSV.

    The output CSV keeps the original schema: a 'pred' column with predicted
    class indices and an (empty) 'label' column.

    :param opt: namespace with device, weights, batch, source, save_csv_name
    """
    device = torch.device('cuda:{}'.format(opt.device))
    model = torch.load(opt.weights)
    model.to(device)
    # Only the vocab is needed to build the tokenizer; the encoder inside the
    # loaded `model` is what actually runs. (Removed unused calc_accuracy
    # helper and unused training hyper-parameters from the original.)
    _, vocab = get_pytorch_kobert_model()
    model.eval()
    tokenizer = get_tokenizer()
    tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
    max_len = 256  # sequences beyond this length are truncated by the dataset transform
    batch_size = opt.batch
    dataset_test = nlp.data.TSVDataset(opt.source, field_indices=[1, 2], num_discard_samples=1)
    data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)
    df = pd.DataFrame(columns=['pred', 'label'])
    pred = np.array([])
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        _, max_idx = torch.max(out, 1)
        pred = np.append(pred, max_idx.cpu().detach().tolist())
    df['pred'] = pred
    df.to_csv(opt.save_csv_name, index=False)
def __init__(self, vocab=None, tokenizer=None, maxlen=30, model_dir=Path('data_in')):
    """Use the supplied vocab/tokenizer pair, or build both from the
    pretrained KoBERT artifacts when either is missing."""
    if vocab is not None and tokenizer is not None:
        self.vocab = vocab
        self.tokenizer = tokenizer
    else:
        tok_path = get_tokenizer()
        self.ptr_tokenizer = SentencepieceTokenizer(tok_path)
        self.ptr_detokenizer = SentencepieceDetokenizer(tok_path)
        _, gluon_vocab = get_pytorch_kobert_model()
        self.vocab = Vocabulary(token2idx=gluon_vocab.token_to_idx)
        self.tokenizer = Tokenizer(vocab=self.vocab,
                                   split_fn=self.ptr_tokenizer,
                                   pad_fn=keras_pad_fn,
                                   maxlen=maxlen)
    self.maxlen = maxlen
    self.model_dir = model_dir
def __init__(self, samples, vocab, media_map, word_dropout_prob=0.0, max_word_dropout_ratio=0.0, max_token_cnt=300):
    """Training dataset over pre-loaded [sentences, media, id] samples.

    :param samples: list of [sentences, media, id] entries
    :param vocab: token vocabulary
    :param media_map: media name -> class index mapping
    :param word_dropout_prob: chance of applying word dropout to a sample
    :param max_word_dropout_ratio: upper bound on the dropped-word fraction
    :param max_token_cnt: cap on tokens per sample
    """
    self.tokenizer = SentencepieceTokenizer(get_tokenizer())
    self.vocab = vocab
    self.samples = samples
    # The second field of each sample is its media (class) label.
    self.targets = [sample[1] for sample in samples]
    self.media_map = media_map
    self.word_dropout_prob = word_dropout_prob
    self.max_word_dropout_ratio = max_word_dropout_ratio
    self.max_token_cnt = max_token_cnt
def BERT_inference(text):
    """Classify a single `text` with a saved KoBERT model and return the
    argmax class index (a tensor)."""
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    _, vocab = get_pytorch_kobert_model(device)
    tokenizer = get_tokenizer()
    tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
    max_len = 80
    # NOTE(review): the following hyper-parameters are unused in inference;
    # kept as-is from the training script.
    batch_size = 64
    warmup_ratio = 0.1
    num_epochs = 10
    max_grad_norm = 1
    log_interval = 30
    learning_rate = 5e-5
    # Loading option 1: restore only the trained parameters
    #new_save_path = 'v3_model_only_parameter_0302.pt'
    #model = BERTClassifier(bertmodel, dr_rate=0.1)
    #model.load_state_dict(new_save_path)
    #model.eval()
    # Loading option 2: load the entire saved model object
    save_path = 'v2_model_0302.pt'
    model = torch.load(save_path)
    model.eval()
    infer_data = BERTDataset_infer(text, 0, tok, max_len, True, False)
    # Take the first (only) encoded sample and shape it as a batch of one.
    infer_data = torch.tensor(next(iter(infer_data))[0]).reshape(1, -1)
    # Single-segment input: all segment ids are zero.
    segments_tensors = torch.zeros(len(infer_data[0]))
    segments_tensors = segments_tensors.reshape(1, -1)
    # NOTE(review): valid_length is set to the padded length, not the count of
    # non-pad tokens — confirm that is what the model expects.
    valid_length = torch.tensor(len(infer_data[0]))
    valid_length = valid_length.reshape(1, -1)
    infer_data = infer_data.long().to(device)
    segments_tensors = segments_tensors.long().to(device)
    valid_length = valid_length.long().to(device)
    with torch.no_grad():
        outputs = model(infer_data, valid_length, segments_tensors)
    print("딥러닝 최종 inference : ", torch.argmax(outputs[0]))
    return torch.argmax(outputs[0])
def data_loader(dtls, max_len, batch_size, num_workers):
    """Split `dtls` 80/20 (fixed seed) and return (train, test) DataLoaders
    built on the KoBERT SentencePiece tokenizer."""
    train_split, test_split = train_test_split(dtls, test_size=0.2, random_state=123)
    tokenizer = get_tokenizer()
    _, vocab = get_pytorch_kobert_model()
    sp_tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
    loaders = []
    for split in (train_split, test_split):
        dataset = BERTDataset(split, 0, 1, sp_tok, max_len, True, False)
        loaders.append(torch.utils.data.DataLoader(dataset,
                                                   batch_size=batch_size,
                                                   num_workers=num_workers))
    return loaders[0], loaders[1]
def calc_vars(df):
    """Add edit-distance and token/length-ratio columns to a dataframe with
    'original' and 'corrected' text columns; rows whose shorter side has zero
    tokens are dropped.

    :param df: DataFrame with 'original' and 'corrected' string columns
    :returns: a new DataFrame with the derived columns
    """
    df['jamo_levenshtein'] = df.apply(
        lambda row: jamo_levenshtein(row['original'], row['corrected']), axis=1)
    tok_path = get_tokenizer()
    sp = SentencepieceTokenizer(tok_path)
    df['0_tokens'] = df['original'].apply(lambda x: len(sp(x)))
    df['1_tokens'] = df['corrected'].apply(lambda x: len(sp(x)))
    df['1_token/0_token'] = df['1_tokens'] / df['0_tokens']
    df['min_tokens'] = df[['0_tokens', '1_tokens']].min(axis=1)
    # .copy() so the later column assignments act on an independent frame —
    # assigning into a boolean-mask slice raises SettingWithCopyWarning and
    # may silently fail to write.
    df = df[df['min_tokens'] > 0].copy()
    df['log_tokens'] = df['min_tokens'].apply(lambda x: math.log(x, 20))
    # Length-normalized, log-damped edit-distance ratio.
    df['ratio'] = df['jamo_levenshtein'] / df['min_tokens'] * df['log_tokens']
    df['0_len'] = df['original'].str.len()
    df['1_len'] = df['corrected'].str.len()
    df['len_ratio'] = df['1_len'] / df['0_len']
    return df
def convert_input_data(sentences):
    """Tokenize `sentences` and run the model over it, returning the output of
    the final (only) batch.

    NOTE(review): `vocab`, `device` and `model` are module-level globals —
    confirm they are initialized before this is called.
    """
    test_data = [sentences]
    print(test_data)
    tokenizer = get_tokenizer()
    tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
    max_len = 128
    test_data = BERTDataset(test_data, 0, tok, max_len, True, False)
    dataloader = torch.utils.data.DataLoader(test_data, batch_size=1, num_workers=1)
    for token_ids, valid_length, segment_ids in dataloader:
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length = valid_length
        result = model(token_ids, valid_length, segment_ids)
        # return "ge"
    # With batch_size=1 and a single input there is exactly one iteration, so
    # `result` is defined here; an empty dataloader would raise NameError.
    return result
def __init__(self, path: str, max_seqlen: int = 512, ignore_index=-100) -> None:
    """Load a pickled token/label dataset and the KoBERT vocab/tokenizer.

    :param path: pickle file with 'token' and 'tgt' lists
    :param max_seqlen: maximum sequence length kept per example
    :param ignore_index: label id ignored by the loss function
    """
    super(TokenizedDataset, self).__init__()
    with open(path, "rb") as fin:
        self.data = pickle.load(fin)
    self.max_len = max_seqlen
    _, self.vocab = get_pytorch_kobert_model()
    self.tokenizer = nlp.data.BERTSPTokenizer(get_tokenizer(), self.vocab, lower=False)
    # For training corpora, cap at 100k examples to bound epoch time.
    if "train" in path:
        self.data["token"] = self.data["token"][:100000]
        self.data["tgt"] = self.data["tgt"][:100000]
    self.tokens = self.data["token"]
    self.labels = self.data["tgt"]
    # Cache special-token ids used when assembling inputs.
    self.cls_idx = self.vocab["[CLS]"]
    self.pad_idx = self.vocab["[PAD]"]
    self.sep_idx = self.vocab["[SEP]"]
    self.mask_idx = self.vocab["[MASK]"]
    self.ignore_idx = ignore_index
def get_dataloaders(dataset_train, dataset_test, vocab, batch_size, max_len=64, class_labels=['0', '1']):
    """Build bucketed train/test gluon DataLoaders plus the BERT tokenizer.

    :param dataset_train: gluonnlp dataset of (text, label) rows
    :param dataset_test: same layout as dataset_train
    :param vocab: KoBERT vocabulary
    :param batch_size: per-bucket batch size
    :param max_len: maximum sequence length for the transform
    :param class_labels: label set (NOTE(review): mutable default argument —
        shared across calls; safe only if never mutated)
    :returns: (train_dataloader, test_dataloader, bert_tokenizer)
    """
    tokenizer = get_tokenizer()
    bert_tokenizer = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
    # for single sentence classification, set pair=False
    # for regression task, set class_labels=None
    # for inference without label available, set has_label=False
    transform = BERTDatasetTransform(bert_tokenizer, max_len,
                                     class_labels=class_labels,
                                     has_label=True, pad=True, pair=False)
    data_train = dataset_train.transform(transform)
    data_test = dataset_test.transform(transform)
    # Bucket by sequence length (item[2] = valid length) to reduce padding.
    train_sampler = nlp.data.FixedBucketSampler(
        lengths=[int(item[2]) for item in data_train],
        batch_size=batch_size, shuffle=True)
    train_dataloader = gluon.data.DataLoader(data_train, batch_sampler=train_sampler)
    # NOTE(review): shuffle=True on the TEST sampler — harmless for aggregate
    # metrics but reorders per-example outputs; confirm it is intentional.
    test_sampler = nlp.data.FixedBucketSampler(
        lengths=[int(item[2]) for item in data_test],
        batch_size=batch_size, shuffle=True)
    test_dataloader = mx.gluon.data.DataLoader(data_test, batch_sampler=test_sampler)
    return train_dataloader, test_dataloader, bert_tokenizer
def main(args):
    """Tokenize a summarization split and write token/aux-label pairs to a
    pickle file for content-selection training."""
    root = args.path
    mode = args.mode
    dset = load_data(root, mode)
    _, vocab = get_pytorch_kobert_model()
    tokenizer = get_tokenizer()
    tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
    tokenized_dset = []
    start_time = time.time()
    for d in tqdm(dset):
        tokenized_dset.append(tokenize(d, tok))
    print("--- %s seconds for tokenizing ---" % (time.time() - start_time))
    start_time = time.time()
    result = {"token": [], "tgt": []}
    for idx, data in tqdm(enumerate(tokenized_dset)):
        # Flatten per-sentence token lists into one flat token sequence.
        src = " ".join([" ".join(d) for d in data["tokenized_src"]]).split(" ")
        tgt = " ".join([" ".join(d) for d in data["tokenized_abs"]]).split(" ")
        # One auxiliary label per source token.
        auxiliary_tgt = make_aux_tgt(src, tgt)
        assert len(
            src) == len(auxiliary_tgt
                        ), f"Length mismatch: {len(src)}, {len(auxiliary_tgt)}"
        result["token"].append(src)
        result["tgt"].append(auxiliary_tgt)
    print("--- %s seconds for generating labels ---" % (time.time() - start_time))
    with open(f"{args.save_path}/contentselection_{mode}.pickle", "wb") as f:
        pickle.dump(result, f)
    print("--- Finished ---")
def main(parser):
    """Train a KoBERT+CRF named-entity-recognition model end-to-end.

    Builds vocab/tokenizer from the pretrained KoBERT artifacts, loads the
    NER corpus, then runs the training loop with gradient accumulation,
    periodic TensorBoard logging and checkpointing.

    :param parser: an argparse.ArgumentParser providing data_dir / model_dir
    :returns: (global_step, average training loss, best_steps)
    """
    # Config
    args = parser.parse_args()
    data_dir = Path(args.data_dir)
    model_dir = Path(args.model_dir)
    # data_config = Config(json_path=data_dir / 'config.json')
    model_config = Config(json_path=model_dir / 'config.json')

    # Vocab & Tokenizer
    tok_path = get_tokenizer()  # ./tokenizer_78b3253a26.model
    ptr_tokenizer = SentencepieceTokenizer(tok_path)
    _, vocab_of_gluonnlp = get_pytorch_kobert_model()
    token_to_idx = vocab_of_gluonnlp.token_to_idx
    model_config.vocab_size = len(token_to_idx)
    vocab = Vocabulary(token_to_idx=token_to_idx)
    print("len(token_to_idx): ", len(token_to_idx))
    # Persist a human-readable copy of the vocab mapping alongside the model.
    with open(model_dir / "token2idx_vocab.json", 'w', encoding='utf-8') as f:
        json.dump(token_to_idx, f, ensure_ascii=False, indent=4)

    # save vocab & tokenizer
    with open(model_dir / "vocab.pkl", 'wb') as f:
        pickle.dump(vocab, f)
    # load vocab & tokenizer (round-trip mirrors what inference will do)
    with open(model_dir / "vocab.pkl", 'rb') as f:
        vocab = pickle.load(f)
    tokenizer = Tokenizer(vocab=vocab, split_fn=ptr_tokenizer, pad_fn=keras_pad_fn, maxlen=model_config.maxlen)
    ner_formatter = NamedEntityRecognitionFormatter(vocab=vocab, tokenizer=tokenizer, maxlen=model_config.maxlen, model_dir=model_dir)

    # Train & Val Datasets
    cwd = Path.cwd()
    data_in = cwd / "data_in"
    train_data_dir = data_in / "NER-master" / "말뭉치 - 형태소_개체명"
    tr_clf_ds = NamedEntityRecognitionDataset(train_data_dir=train_data_dir, model_dir=model_dir)
    tr_clf_ds.set_transform_fn(transform_source_fn=ner_formatter.transform_source_fn, transform_target_fn=ner_formatter.transform_target_fn)
    tr_clf_dl = DataLoader(tr_clf_ds, batch_size=model_config.batch_size, shuffle=True, num_workers=4, drop_last=False)

    # Model
    model = KobertCRF(config=model_config, num_classes=len(tr_clf_ds.ner_to_index))
    model.train()

    # optim — weight decay is disabled for biases and LayerNorm parameters.
    train_examples_len = len(tr_clf_ds)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
    # num_train_optimization_steps = int(train_examples_len / model_config.batch_size / model_config.gradient_accumulation_steps) * model_config.epochs
    t_total = len(tr_clf_dl) // model_config.gradient_accumulation_steps * model_config.epochs
    optimizer = AdamW(optimizer_grouped_parameters, lr=model_config.learning_rate, eps=model_config.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=model_config.warmup_steps, t_total=t_total)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    n_gpu = torch.cuda.device_count()
    # if n_gpu > 1:
    #     model = torch.nn.DataParallel(model)
    model.to(device)

    # save — loggers / managers for checkpoints and metrics
    tb_writer = SummaryWriter('{}/runs'.format(model_dir))
    checkpoint_manager = CheckpointManager(model_dir)
    summary_manager = SummaryManager(model_dir)
    best_val_loss = 1e+10
    best_train_acc = 0

    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(tr_clf_ds))
    logger.info(" Num Epochs = %d", model_config.epochs)
    logger.info(" Instantaneous batch size per GPU = %d", model_config.batch_size)
    # logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d",
    #             args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info(" Gradient Accumulation steps = %d", model_config.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    best_dev_acc, best_dev_loss = 0.0, 99999999999.0
    best_steps = 0
    model.zero_grad()
    set_seed()  # Added here for reproductibility (even between python 2 and 3)

    # Train
    train_iterator = trange(int(model_config.epochs), desc="Epoch")
    for _epoch, _ in enumerate(train_iterator):
        epoch_iterator = tqdm(tr_clf_dl, desc="Iteration")  # , disable=args.local_rank not in [-1, 0]
        epoch = _epoch
        for step, batch in enumerate(epoch_iterator):
            model.train()
            x_input, token_type_ids, y_real = map(lambda elm: elm.to(device), batch)
            log_likelihood, sequence_of_tags = model(x_input, token_type_ids, y_real)

            # loss: negative log-likelihood
            loss = -1 * log_likelihood

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if model_config.gradient_accumulation_steps > 1:
                loss = loss / model_config.gradient_accumulation_steps

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), model_config.max_grad_norm)
            tr_loss += loss.item()

            # Only step the optimizer every `gradient_accumulation_steps` batches.
            if (step + 1) % model_config.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                with torch.no_grad():
                    sequence_of_tags = torch.tensor(sequence_of_tags)
                    print("sequence_of_tags: ", sequence_of_tags)
                    print("y_real: ", y_real)
                    print("loss: ", loss)
                    print("(sequence_of_tags == y_real): ", (sequence_of_tags == y_real))
                    # Token accuracy over non-PAD positions only.
                    mb_acc = (sequence_of_tags == y_real).float()[y_real != vocab.PAD_ID].mean()

                tr_acc = mb_acc.item()
                tr_loss_avg = tr_loss / global_step
                tr_summary = {'loss': tr_loss_avg, 'acc': tr_acc}

                # if step % 50 == 0:
                print('epoch : {}, global_step : {}, tr_loss: {:.3f}, tr_acc: {:.2%}'.format(epoch + 1, global_step, tr_summary['loss'], tr_summary['acc']))

                if model_config.logging_steps > 0 and global_step % model_config.logging_steps == 0:
                    # Log metrics
                    if model_config.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        pass
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) / model_config.logging_steps, global_step)
                    logger.info("Average loss: %s at global step: %s", str((tr_loss - logging_loss) / model_config.logging_steps), str(global_step))
                    logging_loss = tr_loss

                if model_config.save_steps > 0 and global_step % model_config.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(model_config.output_dir, 'epoch-{}'.format(epoch + 1))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    logger.info("Saving model checkpoint to %s", output_dir)
                    state = {'global_step': global_step + 1,
                             'model_state_dict': model.state_dict(),
                             'opt_state_dict': optimizer.state_dict()}
                    summary = {'train': tr_summary}
                    summary_manager.update(summary)
                    summary_manager.save('summary.json')
                    # Based on train acc (NOTE(review): original comment says this
                    # should really use val_acc rather than train_acc).
                    is_best = tr_acc >= best_train_acc
                    # Save
                    if is_best:
                        best_train_acc = tr_acc
                        checkpoint_manager.save_checkpoint(state, 'best-epoch-{}-step-{}-acc-{:.3f}.bin'.format(epoch + 1, global_step, tr_acc))
                    else:
                        torch.save(state, os.path.join(output_dir, 'model-epoch-{}-step-{}-acc-{:.3f}.bin'.format(epoch + 1, global_step, tr_acc)))

    tb_writer.close()
    logger.info(" global_step = %s, average loss = %s", global_step, tr_loss / global_step)

    return global_step, tr_loss / global_step, best_steps