def get_mecab(self):
    try:
        return mecab.MeCab()
    except Exception as e:
        raise Exception(
            'MeCab is not installed. To install it, run: pip install python-mecab-ko'
        ) from e
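# Hedged usage sketch (not from any of the snippets here): how the tagger
# returned by get_mecab() is typically used. morphs/pos/nouns are the public
# python-mecab-ko methods; the sample sentence is an assumption.
import mecab

tagger = mecab.MeCab()
print(tagger.morphs("형태소 분석기를 테스트한다"))  # morpheme surface forms
print(tagger.pos("형태소 분석기를 테스트한다"))     # (surface, POS tag) pairs
print(tagger.nouns("형태소 분석기를 테스트한다"))   # nouns only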
def train():
    # tokenizer
    meCab = mecab.MeCab()

    train_X, train_y, test_X, test_y = make_data('corpus', testprob=0.1)
    #print(len(train_X), len(train_y))

    print('--- Get vocabulary')
    with open('vocab.pickle', 'rb') as f:
        vocab = pickle.load(f)
    print('--- Load vocabulary successfully')
    print('%% Vocabulary size:', len(vocab))

    count_vect = CountVectorizer(tokenizer=meCab.morphs,
                                 ngram_range=(1, 3),
                                 max_features=10000,
                                 vocabulary=vocab)
    X_train_counts = count_vect.transform(train_X)
    print("The number of features: {}".format(X_train_counts.shape[1]))

    tfidf_transformer = TfidfTransformer(use_idf=False, smooth_idf=False, norm='l2')
    X_train_tfidf = tfidf_transformer.transform(X_train_counts)
    #print(X_train_tfidf.shape)

    # Naive Bayes
    # clf = MultinomialNB().fit(X_train_tfidf, train_y)

    # SVM
    clf_svm = SGDClassifier().fit(X_train_tfidf, train_y)

    # Evaluation
    X_test_counts = count_vect.transform(test_X)
    X_test_tfidf = tfidf_transformer.transform(X_test_counts)
    print()
    predicted = clf_svm.predict(X_test_tfidf)
    print("SVM: ", np.mean(predicted == test_y))
    print()
    print("Examples: ")
    print("Input\t Predicted\t Correct")
    for i in range(3):
        print("%s\t=> %s\t: %s" % (test_X[i], predicted[i], test_y[i]))

    # Model save: delete the old model first, if one exists
    if os.path.exists('model/hmc.model'):
        os.remove('model/hmc.model')
    print()
    with open('model/hmc.model', 'wb') as f:
        pickle.dump(clf_svm, f)
    #pickle.dump(count_vect, open('model/count.pickle', 'wb'))
    #pickle.dump(X_train_tfidf, open('model/train_feature.pickle', 'wb'))
    #pickle.dump(X_test_tfidf, open('model/test_feature.pickle', 'wb'))
    print('SVM classifier model saved at "model/hmc.model"')
    print('If you want to load the model, use "pickle.load" in python.')
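# As the final print above suggests, the saved classifier can be restored
# with pickle; a minimal sketch (path taken from the code above):
import pickle

with open('model/hmc.model', 'rb') as f:
    clf_svm = pickle.load(f)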
def check_morphs(lst, corpus_fname, output_fname, log_fname):
    mcab = mecab.MeCab()
    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
         open(output_fname, 'w', encoding='utf-8') as f2, \
         open(log_fname, 'w', encoding='utf-8') as f3:
        sentences = f1.read()
        for item in lst:
            cnt, word = item
            if cnt < 10:
                continue
            tokens = mcab.morphs(word)
            if len(tokens) == 1:
                continue
            words = re.findall(' '.join(tokens), sentences)
            if len(words) < (cnt * 0.05):
                # If the split form occurs less than 5% as often as the
                # unsplit word, treat the morpheme split as an error.
                (cho, jung, jong) = hgtk.letter.decompose(word[-1])
                if 'ㄱ' <= jong <= 'ㅎ':
                    dic_line = "{},,,,NNP,*,{},{},*,*,*,*,*".format(word, 'T', word)
                else:
                    dic_line = "{},,,,NNP,*,{},{},*,*,*,*,*".format(word, 'F', word)
                # print("{}\t{}\t{}\t{}\t{}".format(word, ' '.join(tokens), cnt, len(words), jong))
                f2.write(dic_line + '\n')
                f3.write("{}\t{}\t{}\t{}".format(word, ' '.join(tokens), cnt, len(words)) + '\n')
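# A small sketch (sample words are assumptions) of the jongseong check that
# picks the T/F field of the mecab-ko-dic user-dictionary line above:
# hgtk.letter.decompose returns (choseong, jungseong, jongseong), and a
# non-empty jongseong means the syllable ends in a final consonant.
import hgtk

for w in ("학교", "사람"):
    cho, jung, jong = hgtk.letter.decompose(w[-1])
    print(w, "T" if 'ㄱ' <= jong <= 'ㅎ' else "F")  # 학교 -> F, 사람 -> T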
def load(self, device: str):
    """
    Load user-selected task-specific model

    Args:
        device (str): device information

    Returns:
        object: User-selected task-specific model

    """
    if "brainbert" in self.config.n_model:
        try:
            import mecab
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install python-mecab-ko with: `pip install python-mecab-ko`"
            )

        from pororo.models.brainbert import BrainRobertaModel
        from pororo.utils import postprocess_span

        model = (BrainRobertaModel.load_model(
            f"bert/{self.config.n_model}",
            self.config.lang,
        ).eval().to(device))

        tagger = mecab.MeCab()

        return PororoBertMrc(model, tagger, postprocess_span, self.config)
def test_modules(self):
    try:
        import mecab

        tagger = mecab.MeCab()

        res = postprocess_span(tagger, "이민자들은")
        self.assertIsInstance(res, str)
        self.assertEqual(res, "이민자들")

        res = postprocess_span(tagger, "8100억원에")
        self.assertIsInstance(res, str)
        self.assertEqual(res, "8100억원")

        res = postprocess_span(tagger, "1960년대부터")
        self.assertIsInstance(res, str)
        self.assertEqual(res, "1960년대")

        res = postprocess_span(tagger, "군사 목적으로는,")
        self.assertIsInstance(res, str)
        self.assertEqual(res, "군사 목적")
    except ModuleNotFoundError as error:
        raise error.__class__(
            "Please install python-mecab-ko with: `pip install python-mecab-ko`"
        )
def __init__(self, **kwargs):
    self.system = kwargs['system']
    self.datetimeParser = self.getParser()
    self.mecab = mecab.MeCab()
    if 'topKw' in kwargs:
        self.topKw = kwargs['topKw']
    else:
        self.topKw = 30
def mecab_tokenize(corpus_fname, output_fname):
    mcab = mecab.MeCab()
    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
         open(output_fname, 'w', encoding='utf-8') as f2:
        for line in f1:
            sentence = line.replace('\n', '').strip()
            tokens = mcab.morphs(sentence)
            tokenized_sent = ' '.join(tokens)
            f2.write(tokenized_sent + '\n')
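# Hedged usage of mecab_tokenize (file names are assumptions): it rewrites
# a raw corpus as one space-joined morpheme sequence per line.
with open('sample_corpus.txt', 'w', encoding='utf-8') as f:
    f.write('자연어 처리는 재미있다\n')
mecab_tokenize('sample_corpus.txt', 'sample_tokenized.txt')
with open('sample_tokenized.txt', encoding='utf-8') as f:
    print(f.read())  # morphemes separated by single spaces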
def check_morphs(lst, corpus_fname, output_fname, log_fname):
    mcab = mecab.MeCab()

    model_fname = 'soyword.model'
    word_extractor = WordExtractor(
        min_frequency=100,
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0
    )
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()
    scores = {key: (scores[key].cohesion_forward *
                    math.exp(scores[key].right_branching_entropy))
              for key in scores.keys()}
    soy_tokenizer = LTokenizer(scores=scores)

    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
         open(output_fname, 'w', encoding='utf-8') as f2, \
         open(log_fname, 'w', encoding='utf-8') as f3:
        sentences = f1.read()
        for item in lst:
            cnt, word = item
            if cnt < 10 or len(word) == 1:
                continue
            tokens = mcab.morphs(word)
            if len(tokens) == 1:
                continue
            soy_tokens = soy_tokenizer.tokenize(word)
            if ' '.join(tokens) == ' '.join(soy_tokens):
                continue
            if is_all_nng(mcab.pos(word)):
                #print("nouns only : {}".format(word))
                #print("{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt))
                continue
            if len(soy_tokens) > 1:
                continue
            #print("{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt))
            words = re.findall(' '.join(tokens), sentences)
            if len(words) < (cnt * 0.05):
                # If the split form occurs less than 5% as often as the
                # unsplit word, treat the morpheme split as an error.
                (cho, jung, jong) = hgtk.letter.decompose(word[-1])
                if 'ㄱ' <= jong <= 'ㅎ':
                    dic_line = "{},,,1000,NNP,*,{},{},*,*,*,*,*".format(word, 'T', word)
                else:
                    dic_line = "{},,,1000,NNP,*,{},{},*,*,*,*,*".format(word, 'F', word)
                print("{}\t{}\t{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt, len(words), jong))
                f2.write(dic_line + '\n')
                f3.write("{}\t{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt, len(words)) + '\n')
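# Sketch (toy scores are assumptions) of how the LTokenizer built above
# consumes the cohesion * exp(branching entropy) scores: for each eojeol it
# picks the left substring with the highest score.
from soynlp.tokenizer import LTokenizer

toy = LTokenizer(scores={"데이터": 0.8, "데이": 0.2})
print(toy.tokenize("데이터분석"))  # ['데이터', '분석']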
def decode(sentence):
    # tokenizer
    meCab = mecab.MeCab()

    print('--- Get vocabulary')
    try:
        with open('vocab.pickle', 'rb') as f:
            vocab = pickle.load(f)
    except FileNotFoundError:
        print('Loading vocabulary ERROR. There is no vocabulary.')
        return None
    print('--- Load vocabulary successfully: vocab.pickle')
    print('%% Vocabulary size:', len(vocab))
    print(vocab)
    print()

    try:
        with open('model/hmc.model', 'rb') as f:
            model = pickle.load(f)
        print("--- Loading model Successfully: model/hmc.model")
    except FileNotFoundError:
        print("Loading model Failed. There is no model.")
        return None
    print(model)
    print()

    #count_vect = pickle.load(open('model/count.pickle', 'rb'))
    count_vect = CountVectorizer(tokenizer=meCab.morphs,
                                 ngram_range=(1, 3),
                                 max_features=10000,
                                 vocabulary=vocab)
    tfidf_vect = TfidfVectorizer(tokenizer=meCab.morphs,  # (unused)
                                 ngram_range=(1, 3),
                                 max_features=10000,
                                 vocabulary=vocab)

    # vectorize
    sent_counts = count_vect.transform([sentence])
    tfidf_transformer = TfidfTransformer(use_idf=False, smooth_idf=False, norm='l2')
    sent_tfidf = tfidf_transformer.transform(sent_counts)

    pred = model.predict(sent_tfidf)
    print('Input:', sentence)
    print('Prediction:', pred)
    return pred
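# Hedged usage of decode() (the sentence is an assumption): it returns the
# model's prediction array, or None when vocab.pickle or model/hmc.model is
# missing.
# pred = decode('환율이 크게 올랐다')
# if pred is not None:
#     print(pred[0])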
def tokenizer(sent):
    # NOTE: python-mecab-ko has no parseToNode (that is the mecab-python3
    # Tagger API), so the surface forms are collected via pos() instead.
    meCab = mecab.MeCab()
    result = []
    for surface, tag in meCab.pos(sent):
        #output = '%s/%s' % (surface, tag)
        if surface:
            result.append(surface)
    return result
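# Note (assumption about python-mecab-ko's behavior): since morphs() already
# returns the surface form of every morpheme, the function above should be
# equivalent to a single call:
# result = mecab.MeCab().morphs(sent)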
def __init__(self, rawpath, isEng=False):
    self.isEng = isEng
    # inflect engine: number_to_string(int)
    self.p = inflect.engine()
    # mecab class
    self.meCab = mecab.MeCab()
    # Loading raw corpus file
    self.raw = dc.load_raw(rawpath)
    self.contents = self.load_contents('work/contents/')
def get_mecab(self):
    if os.name == 'nt':
        from konlpy.tag import Mecab
        mecab_ko_dic_path = os.environ.get('MECAB_KO_DIC_PATH',
                                           r"C:\mecab\mecab-ko-dic")
        return Mecab(dicpath=mecab_ko_dic_path)
    else:
        import mecab
        try:
            return mecab.MeCab()
        except Exception as e:
            raise Exception(
                'MeCab is not installed. To install it, run: pip install python-mecab-ko'
            ) from e
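# Hedged usage note (path is an assumption): on Windows the dictionary
# location probed above can be overridden via the environment before the
# call.
# os.environ['MECAB_KO_DIC_PATH'] = r"D:\tools\mecab-ko-dic"
# tagger = self.get_mecab()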
def __init__(self, mode='serve', data_path='./dataset/train.tsv'):
    self.mode = mode
    self.data_path = data_path
    self.train_corpus = None
    if self.mode == 'train':
        dataset = pd.read_table(self.data_path)
        dataset = dataset[['titles', 'labels']]
        self.train_corpus = dataset['titles'].tolist()

    # python -m spacy download en
    # python -m spacy download en_core_web_sm
    # python -m spacy link en_core_web_sm en
    self.tokenizer = self.soynlp_tokenizer()
    self.nlp = spacy.load('en')
    self.mecab = mecab.MeCab()
    self.okt = Okt()
def load(self, device: str):
    if "brainbert" in self.config.n_model:
        try:
            import mecab
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install python-mecab-ko with: `pip install python-mecab-ko`"
            )

        # from pororo.models.brainbert import BrainRobertaModel
        from pororo.utils import postprocess_span

        model = My_BrainRobertaModel.load_model(
            f"bert/{self.config.n_model}",
            self.config.lang).eval().to(device)

        tagger = mecab.MeCab()

        return PororoBertMrc(model, tagger, postprocess_span, self.config)
def _init_tagger(self, taggername):
    global usingkonlpy
    # "import konlpy.tag" (rather than "from konlpy.tag import ...") so the
    # konlpy name used below is actually bound on every call that needs it.
    if taggername != "Mecab" or usingkonlpy:
        import konlpy.tag
        usingkonlpy = True
    if taggername == "Okt":
        self.tagger = konlpy.tag.Okt()
        self.tagsOkt = True
    elif taggername == "Mecab":
        if usingkonlpy:
            # Use Mecab(dicpath="c:/some/path/mecab-ko-dic") for a
            # non-default location. (?? mecab uses rcfile and dicdir not
            # dicpath)
            self.tagger = konlpy.tag.Mecab()
        else:
            self.tagger = mecab.MeCab()
        self.tagsMecab = True
    elif taggername == "Komoran":
        self.tagger = konlpy.tag.Komoran()
        self.tagsKomoran = True
    else:
        raise Exception("Bad tagger name " + taggername)
def initialize(self, ctx):
    self.manifest = ctx.manifest
    properties = ctx.system_properties
    model_dir = properties.get("model_dir")
    self.device = torch.device("cuda:" + str(properties.get("gpu_id"))
                               if torch.cuda.is_available() else "cpu")
    logger.debug('Will load from {0}'.format(model_dir))

    # Read model serialize/pt file
    x = hub_utils.from_pretrained(model_dir,
                                  "model.pt",
                                  DATA_PATH,
                                  load_checkpoint_heads=True)
    model_interface = BrainRobertaHubInterface(
        x["args"],
        x["task"],
        x["models"][0],
        model_dir,
    ).to(self.device)
    tagger = mecab.MeCab()
    self.model = PororoBertMrc(model_interface,
                               tagger,
                               postprocess_span,
                               TaskConfig("mrc", "ko", "brainbert.base.ko.korquad"))

    # Read the mapping file, index to object name
    # mapping_file_path = os.path.join(model_dir, "index_to_name.json")
    # if os.path.isfile(mapping_file_path):
    #     with open(mapping_file_path) as f:
    #         self.mapping = json.load(f)
    # else:
    #     logger.warning('Missing the index_to_name.json file. Inference output will not include class name.')

    self.initialized = True
def check_morphs(lst, corpus_fname, output_fname, log_fname):
    mcab = mecab.MeCab()

    model_fname = 'soyword.model'
    word_extractor = WordExtractor(
        min_frequency=100,
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0
    )
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()
    scores = {key: (scores[key].cohesion_forward *
                    math.exp(scores[key].right_branching_entropy))
              for key in scores.keys()}
    soy_tokenizer = LTokenizer(scores=scores)

    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
         open(output_fname, 'w', encoding='utf-8') as f2, \
         open(log_fname, 'w', encoding='utf-8') as f3:
        sentences = f1.read()
        for item in lst:
            cnt, word = item
            if cnt < 100 or len(word) == 1:
                continue
            tokens = mcab.morphs(word)
            if len(tokens) == 1:
                continue
            (cho, jung, jong) = hgtk.letter.decompose(word[-1])
            if 'ㄱ' <= jong <= 'ㅎ':
                dic_line = "{},,,,NNP,*,{},{},*,*,*,*,*".format(word, 'T', word)
            else:
                dic_line = "{},,,,NNP,*,{},{},*,*,*,*,*".format(word, 'F', word)
            f2.write(dic_line + '\n')
            f3.write("{}\t{}\t{}".format(word, ' '.join(tokens), cnt) + '\n')
def make_vocab(corpus_path, save_path):
    meCab = mecab.MeCab()
    fl = glob.glob(corpus_path + "/*.txt")

    all_morphs = []
    all_tri = []
    all_bi = []
    for fn in fl:
        with open(fn, 'r', encoding='utf-8') as f:
            raw = f.readlines()
        for s in raw:
            morphs = meCab.morphs(s)
            for m in morphs:
                all_morphs.append(m)
            for w1, w2, w3 in trigrams(s, pad_left=True, pad_right=True):
                all_tri.append((w1, w2, w3))
            for w1, w2 in bigrams(s, pad_left=True, pad_right=True):
                all_bi.append((w1, w2))

    #all_morphs = nltk.FreqDist(w for w in all_morphs)
    #uni = list(all_morphs)[:200]
    #all_tri = nltk.FreqDist(tri for tri in all_tri)
    #tri = list(all_tri)[:200]
    #all_bi = nltk.FreqDist(bi for bi in all_bi)
    #bi = list(all_bi)[:200]
    #vocab = uni + tri + bi
    vocab = list(set(all_morphs))

    with open(save_path, 'wb') as f:
        pickle.dump(vocab, f)
    print("--- Vocabulary saved in", save_path)
    print('%% The size of vocabulary:', len(vocab))
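# Hypothetical call (paths are assumptions) tying make_vocab to the train()
# and decode() snippets above, which expect the pickle at 'vocab.pickle':
# make_vocab('corpus', 'vocab.pickle')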
def __init__(self, src_lang, tgt_lang, phrase_maxlen):
    langs = {src_lang, tgt_lang}
    if "ja" in langs:
        # pip install mecab-python3 unidic-lite
        import MeCab
        self.ja_tagger = MeCab.Tagger()
    if "ko" in langs:
        # pip install python-mecab-ko
        import mecab
        self.ko_tagger = mecab.MeCab()

    alnum = "0-9A-Za-z\xC0-\xFF"  # Latin-1
    alnum += "\u0100-\u017F"  # Latin Extended-A
    alnum += "\u0180-\u024F"  # Latin Extended-B
    alnum += "\u1E00-\u1EFF"  # Latin Extended Additional
    alnum += "\uAC00-\uD7AF"  # Hangul Syllables
    self.RE_NAN_L = re.compile("(?<=[^ %s])(?=[%s])" % (alnum, alnum))
    self.RE_NAN_R = re.compile("(?<=[%s])(?=[^ %s])" % (alnum, alnum))
    self.phrase_maxlen = phrase_maxlen
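# A runnable sketch (simplified character class, sample text assumed) of
# what RE_NAN_L/RE_NAN_R match: the zero-width boundary between an
# alphanumeric/Hangul run and any other non-space character, so substituting
# a space separates punctuation from words.
import re

alnum = "0-9A-Za-z\uAC00-\uD7AF"  # digits, ASCII letters, Hangul syllables
RE_NAN_L = re.compile("(?<=[^ %s])(?=[%s])" % (alnum, alnum))
RE_NAN_R = re.compile("(?<=[%s])(?=[^ %s])" % (alnum, alnum))

text = "가격:1000원!"
print(RE_NAN_R.sub(" ", RE_NAN_L.sub(" ", text)))  # 가격 : 1000원 !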
def PosWithSpace(sentence, mecab_ko=None, extend=False):
    if mecab_ko is None:
        mecab_ko = mecab.MeCab()

    # Re-insert spacing into the POS-tagged output (tagged as SPACE).
    pos = mecab_ko.pos(sentence)
    sentence_sliced = sentence
    pos_with_space = []
    for p in pos:
        while sentence_sliced[0] == ' ':
            sentence_sliced = sentence_sliced[1:]
            pos_with_space.append((' ', 'SPACE'))
        sentence_sliced = sentence_sliced[len(p[0]):]
        pos_with_space.append(p)

    if extend:
        pos_extended = []
        for p in pos_with_space:
            pos_extended += [('', pp) for pp in p[1].split('+')]
            pos_extended[-1] = (p[0], pos_extended[-1][1])
        return pos_extended
    else:
        return pos_with_space
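# Hedged usage (sentence and tags are illustrative): the (' ', 'SPACE')
# entries let the tagged sequence be re-joined with the original spacing.
# PosWithSpace("시동 걸어")
# -> e.g. [('시동', 'NNG'), (' ', 'SPACE'), ('걸', 'VV'), ('어', 'EC')]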
# for i in range(0, len(json_data["sentence"][j]["dependency"])):
#     #json_lemma = json_data["sentence"][j]["morp"][i]["lemma"]
#     json_lemma = json_data["sentence"][j]["dependency"][i]["text"]
#     #json_type = json_data["sentence"][j]["morp"][i]["type"]
#     json_type = json_data["sentence"][j]["dependency"][i]["label"]
#     print(json_lemma + "\t" + json_type)
# print("\n")

###############################################################################################################
################################ Exobrain data named-entity extraction code ##################################
import os
import sys
import mecab

os.chdir(os.getcwd())
mecab = mecab.MeCab()  # note: rebinds the module name to the tagger instance

sys.stdout = open("NEW_EXOBRAIN_NE_CORPUS_10000.txt", "w", -1, "utf-8")
f = open("EXOBRAIN_NE_CORPUS_10000.txt", "rt", -1, "utf-8")

while True:
    line = f.readline()
    if not line:
        break
    tokened_str = mecab.morphs(line)
    check = 0
    lemma = []
    type = []
def __init__(self, req):
    ###################################
    # Request form
    # 'utt': User input utterance
    # 'code': Status Code
    #  API Server Code
    #  '7000': first input
    #  '7001': No necessary slots (Control_Engine_Start only)
    #  '8000': After PIN input, previous command not yet processed
    #
    #  Bluelink Server Code
    #  '200': Vehicle control succeeded
    #  '4002': Invalid Request Body
    #  '4003': Invalid pin
    #  '4004': Duplicate request
    #  '4005': Unsupported control request
    #  '4011': Invalid access token
    #  '4081': Request timeout
    #  '5001': Internal Server Error
    #  '5031': Service Temporary Unavailable
    #  '5041': Gateway timeout
    # 'user_key': Identifier for users
    ####################################

    # Load the Mecab morphological analyzer
    self.mecab = mecab.MeCab()

    # Requests arrive from the API server as JSON;
    # parse them with the json module and use the result as a dictionary.
    if type(req) == str:
        req_body = json.loads(req)
    else:
        req_body = req

    # Store the request fields in separate variables for convenience.
    # utt: user input utterance
    # code: status code for the current state (see the comment block above)
    # user_key: encrypted id distinguishing the user who entered the utterance
    self.utt = req_body['utt']
    self.code = req_body['code']
    self.user_key = req_body['user_key']

    # When the dialogue gets deeper, some information must be preserved
    # (e.g. when starting the engine, the temperature must be kept until
    # the PIN is entered). That information is stored in req['options'].
    # options list:
    # 1. cabin temperature ('temp')
    self.options = req_body['options']

    # Morphological analysis is required to check the LM rules, so the
    # user utterance is POS-tagged here. The analyzed utterance looks like:
    # e.g. input utterance: 시동 걸어
    # pos: [
    #     ('시동', 'NNG'),
    #     ('걸', 'VV'),
    #     ('어', 'EC'),
    # ]
    self.pos = self.mecab.pos(self.utt)

    # Load the file holding the LM rules and response forms
    # (could later be replaced by a DB). The loaded information is
    # stored as a dictionary; see make_rule.py for the exact format.
    # DB replacement
    self.dm = self.get_DM_from_DB()

    # Find the intention in the user utterance.
    # If no intention is found, False is stored; currently False is
    # also stored when a PIN or temperature is entered.
    self.intention = self.__get_intention__()
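# A hypothetical request body (values are assumptions) matching the fields
# parsed in the constructor above:
sample_req = {
    "utt": "시동 걸어",
    "code": "7000",
    "user_key": "encrypted-user-id",
    "options": {},
}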
def get(self):
    args = request.args
    received_message = args['msg']

    analysis_frame = select_analysis_table_frame()
    print(analysis_frame)

    question_nouns = ''
    final_slot_max_tfidf_value = 0.0
    final_slot_nouns = ''

    analysis_frame['slot_str'] = pd.Series(analysis_frame['slot_1'] + ' ' +
                                           analysis_frame['slot_2'] + ' ' +
                                           analysis_frame['slot_3'] + ' ' +
                                           analysis_frame['slot_4'])
    print(analysis_frame['slot_str'])

    tagger = mecab.MeCab()
    print("MeCab instance created.")
    nouns = tagger.nouns(received_message)
    print("MeCab noun extraction done.")
    for noun in nouns:
        question_nouns += noun + ' '

    result = ""
    print('question : ' + received_message)
    print('question_nouns : ' + question_nouns + '\n')

    for index, row in analysis_frame.iterrows():
        slots_nouns = ''
        nouns = tagger.nouns(row['slot_str'])
        for noun in nouns:
            slots_nouns += noun + ' '

        calc_list = list()
        calc_list.append(question_nouns)
        calc_list.append(slots_nouns)
        print('slots calc data : ' + str(calc_list) + '\n')

        tfidf_vectorizer = TfidfVectorizer(min_df=1)
        tfidf_matrix = tfidf_vectorizer.fit_transform(calc_list)
        document_distances = (tfidf_matrix * tfidf_matrix.T)
        print('slots and question - document_distances : ' +
              str(document_distances.toarray()[0][1]))

        if document_distances.toarray()[0][1] >= 0.8:
            result += str(row['final_slot']) + ' at ' + str(row['upload_date']) + ' / '

            nouns = tagger.nouns(row['final_slot'])
            final_slot_nouns = slots_nouns + ' '
            for noun in nouns:
                final_slot_nouns += noun + ' '

            calc_list = list()
            calc_list.append(question_nouns)
            calc_list.append(final_slot_nouns)
            print('final slot calc data : ' + str(calc_list) + '\n')

            tfidf_vectorizer = TfidfVectorizer(min_df=1)
            tfidf_matrix = tfidf_vectorizer.fit_transform(calc_list)
            document_distances = (tfidf_matrix * tfidf_matrix.T)
            print('final slot and question - document_distances : ' +
                  str(document_distances.toarray()[0][1]))

            if document_distances.toarray()[0][1] >= 0.85:
                result = row['data_content']
                break

    if result == "":
        result = "일치할 것으로 예상되는 값이 없습니다."  # "No matching value was found."

    # convert to a dictionary
    # json_dict = dict(json.loads(return_m))
    # print(json_dict.keys())
    # print(json_dict.values())

    print("Received message: " + received_message)
    return jsonify({'result': result})
def load(self, device: str):
    """
    Load user-selected task-specific model

    Args:
        device (str): device information

    Returns:
        object: User-selected task-specific model

    """
    if self.config.n_model == "nltk":
        import nltk
        try:
            nltk.data.find("tokenizers/punkt")
        except LookupError:
            nltk.download("punkt")
        try:
            nltk.data.find("taggers/averaged_perceptron_tagger")
        except LookupError:
            nltk.download("averaged_perceptron_tagger")
        return PororoNLTKPosTagger(nltk, self.config)

    if self.config.n_model == "mecab-ko":
        try:
            import mecab
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install python-mecab-ko with: `pip install python-mecab-ko`"
            )
        model = mecab.MeCab()
        return PororoMecabPos(model, self.config)

    if self.config.n_model == "mecab-ipadic":
        try:
            import fugashi
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install fugashi with: `pip install fugashi`")
        try:
            import ipadic
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install ipadic with: `pip install ipadic`")
        dic_dir = ipadic.DICDIR
        mecabrc = os.path.join(dic_dir, "mecabrc")
        mecab_option = "-d {} -r {} ".format(
            dic_dir,
            mecabrc,
        )
        model = fugashi.GenericTagger(mecab_option)
        return PororoMecabJap(model, self.config)

    if self.config.n_model == "jieba":
        try:
            import jieba  # noqa
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install jieba with: `pip install jieba`")
        import jieba.posseg as jieba_pos
        model = jieba_pos
        return PororoJieba(model, self.config)
def load(self, device: str):
    """
    Load user-selected task-specific model

    Args:
        device (str): device information

    Returns:
        object: User-selected task-specific model

    """
    if "sent" in self.config.n_model:
        import nltk
        try:
            nltk.data.find("tokenizers/punkt")
        except LookupError:
            nltk.download("punkt")
        from nltk.tokenize import sent_tokenize
        return PororoSentTokenizer(sent_tokenize, self.config)

    if self.config.n_model == "mecab_ko":
        try:
            import mecab
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install python-mecab-ko with: `pip install python-mecab-ko`"
            )
        model = mecab.MeCab()
        return PororoMecabKoTokenizer(model, self.config)

    if self.config.n_model == "char":
        return PororoCharTokenizer(self.config)

    if self.config.n_model == "jamo":
        return PororoJamoTokenizer(self.config)

    if self.config.n_model == "word":
        return PororoWordTokenizer(self.config)

    if self.config.n_model == "roberta":
        from fairseq.data.encoders.gpt2_bpe import get_encoder
        encoder = download_or_load("misc/encoder.json", self.config.lang)
        vocab = download_or_load("misc/vocab.bpe", self.config.lang)
        model = get_encoder(encoder, vocab)
        with open(encoder, "r") as f_vocab:
            vocab = json.load(f_vocab)
        inv_dict = {v: k for k, v in vocab.items()}
        return PororoRoBERTaTokenizer(model, vocab, inv_dict, self.config)

    if self.config.n_model == "moses":
        try:
            from sacremoses import MosesDetokenizer, MosesTokenizer
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install sacremoses with: `pip install sacremoses`")
        model = MosesTokenizer(lang="en")
        detok = MosesDetokenizer(lang="en")
        return PororoMosesTokenizer(model, detok, self.config)

    if self.config.n_model == "jieba":
        try:
            import jieba
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install jieba with: `pip install jieba`")
        model = jieba.cut
        return PororoJiebaTokenizer(model, self.config)

    if self.config.n_model == "mecab":
        try:
            import fugashi
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install fugashi with: `pip install fugashi`")
        try:
            import ipadic
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install ipadic with: `pip install ipadic`")
        dic_dir = ipadic.DICDIR
        mecabrc = os.path.join(dic_dir, "mecabrc")
        mecab_option = "-d {} -r {} ".format(
            dic_dir,
            mecabrc,
        )
        model = fugashi.GenericTagger(mecab_option)
        return PororoMecabTokenizer(model, self.config)
    else:
        from pororo.tasks.utils.tokenizer import CustomTokenizer
        path = download_or_load(
            f"tokenizers/{self.config.n_model}.zip",
            self.config.lang,
        )
        ext = "json" if "unigram" not in self.config.n_model else "txt"
        merges_filename = (f"{path}/merges.txt"
                           if "unigram" not in self.config.n_model else None)
        model = CustomTokenizer.from_file(
            vocab_filename=f"{path}/vocab.{ext}",
            merges_filename=merges_filename,
            normalize=True if "jpe" not in self.config.n_model else False,
        )
        if "jpe" in self.config.n_model:
            return PororoJamoPairTokenizer(model, self.config)
        if "mecab.bpe" in self.config.n_model:
            return PororoMecabSPTokenizer(model, self.config)
        return PororoSPTokenizer(model, self.config)
from fairseq import hub_utils  # needed for hub_utils.from_pretrained below
from fairseq.models.roberta import RobertaHubInterface, RobertaModel
import mecab
import torch

from pororo.models.brainbert.BrainRoBERTa import BrainRobertaHubInterface
from pororo.tasks.machine_reading_comprehension import PororoBertMrc
from pororo.tasks.utils.base import TaskConfig
from pororo.tasks.utils.download_utils import download_or_load
from pororo.tasks.utils.tokenizer import CustomTokenizer
from pororo.utils import postprocess_span

ckpt_dir = download_or_load("bert/brainbert.base.ko.korquad", "ko")
tok_path = download_or_load("tokenizers/bpe32k.ko.zip", "ko")

x = hub_utils.from_pretrained(
    ckpt_dir,
    "model.pt",
    ckpt_dir,
    load_checkpoint_heads=True,
)
model = BrainRobertaHubInterface(
    x["args"],
    x["task"],
    x["models"][0],
    tok_path,
).to(torch.device("cuda"))
tagger = mecab.MeCab()

final = PororoBertMrc(model, tagger, postprocess_span,
                      TaskConfig("mrc", "ko", "brainbert.base.ko.korquad"))
print(final("이름이 뭐야?", "이름은 시리야."))