import re

import khaiii as kh


def name_tokenize(sentence):
    api = kh.KhaiiiApi()
    api.open()
    # Collapse runs of special symbols to a dot, then turn dots into spaces.
    x = re.sub(r'[-\.\$\+>#\}\{\*<&@%;\\\)\(="?!\[\]~\^/:,_\|]+', '.',
               str(sentence).replace("'", "")).replace('.', ' ')
    res = []
    try:
        for word in api.analyze(x):
            if re.match(r'\d+', word.lex) is not None:
                # Extract only '2000년대' from tokens such as '2000년대' or '2000년대의'.
                tmp = ''
                for m in word.morphs:
                    if m.tag in ['NNG', 'NNP', 'NNB', 'SN']:
                        tmp += m.lex
                res.append(tmp)
            else:
                a = len(res)
                for i in genre:
                    if i in word.lex:
                        # If a genre name such as '발라드' (ballad) or '메탈' (metal)
                        # appears in the title, add it to the list directly.
                        res.append(i)
                for j in artist:
                    # Add artist names of two or more characters contained in the word.
                    if j in word.lex and len(j) > 1:
                        res.append(j)
                b = len(res)
                if a == b:
                    for m in word.morphs:
                        # Accept only nouns, numbers, and foreign words.
                        if m.tag in ['NNG', 'NNP', 'NNB', 'SN', 'SL']:
                            res.append(m.lex)
    except Exception:
        pass
    return ' '.join(res)
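# A minimal usage sketch for name_tokenize above. The module-level `genre`
# and `artist` lists are assumptions (the function expects them as globals),
# and the exact output depends on the khaiii model.
genre = ['발라드', '메탈', '록', '힙합']
artist = ['아이유']

print(name_tokenize("2000년대의 발라드 명곡 모음!"))
# e.g. '2000년대 발라드 명곡 모음'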
import sys
import traceback


def preprocessing(data):
    try:
        from konlpy.tag import Okt, Kkma
        import khaiii
        # `opt` is a module-level config object holding the khaiii paths.
        khaiii_api = khaiii.KhaiiiApi(opt.khaiii_so_path)
        khaiii_api.open(opt.khaiii_path)
        kkma = Kkma()
        kkma_tokenizer = kkma.nouns
        twitter = Okt()
        okt_tokenizer = twitter.nouns
        cls, data_path_list, div, out_path, begin_offset, end_offset = data
        data = cls()
        data.load_y_vocab()
        data.preprocessing(data_path_list, div, begin_offset, end_offset,
                           out_path, okt_tokenizer, khaiii_api, kkma_tokenizer)
    except Exception:
        # Re-raise with the full traceback so worker failures surface in the parent.
        raise Exception("".join(traceback.format_exception(*sys.exc_info())))
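# A hedged sketch of how this worker might be driven from a multiprocessing
# pool. `Data` (a dataset class exposing load_y_vocab/preprocessing), the
# chunk file name, and the offsets are all assumptions, not part of the
# snippet above.
from multiprocessing import Pool

jobs = [(Data, ['train.chunk.01'], 'train', 'out/train', 0, 100000)]
with Pool(2) as pool:
    pool.map(preprocessing, jobs)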
import khaiii


def kakao_postagger_nn_finder(summary_text):
    api = khaiii.KhaiiiApi()
    api.open()
    nn_word_list = []
    for word in api.analyze(summary_text):
        # Concatenate noun-like morphemes (nouns, determiners, numbers,
        # foreign words) within each eojeol into one compound token.
        complex_morphs = ""
        for m in word.morphs:
            if m.tag.startswith(("N", "MM", "SN", "SL")):
                complex_morphs += m.lex
        if len(complex_morphs) > 1:
            nn_word_list.append(complex_morphs)
    return nn_word_list
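# Quick check of the noun finder; the sample sentence and the token split
# shown are illustrative, since results depend on the khaiii model.
print(kakao_postagger_nn_finder('카카오가 공개한 형태소 분석기 카이'))
# e.g. ['카카오', '형태소', '분석기', '카이']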
def sent2khaiii(data, tag=False):
    '''Khaiii morphological analyzer.

    data: input data (str or list/Series)
    tag: whether to include POS tags (False or True)

    Examples:
        sent2khaiii(["우리집에 왜 왔니", "왜 왔니"])
        >>> [['우리', '집', '에', '왜', '오', '았', '니'], ['왜', '오', '았', '니']]
        sent2khaiii(["우리집에 왜 왔니", "왜 왔니"], tag=True)
        >>> [['우리/NP', '집/NNG', '에/JKB', '왜/MAG', '오/VV', '았/EP', '니/EC'],
             ['왜/MAG', '오/VV', '았/EP', '니/EC']]
    '''
    import khaiii
    api = khaiii.KhaiiiApi()
    api.open()
    if type(data) == str:
        data = [data]
    return [[
        a.lex + "/" + a.tag if tag else a.lex
        for word in api.analyze(str(sent))
        for a in word.morphs
    ] if str(sent).strip() else sent for sent in data]
def setUp(self):
    self._api = khaiii.KhaiiiApi()
    self._api.set_log_level('all', 'warn')
    self._api.open()
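# A hedged companion test for the setUp above; the sample sentence and the
# expected word count are assumptions about khaiii's eojeol-level output.
def test_analyze(self):
    words = self._api.analyze('안녕하세요')
    self.assertEqual(len(words), 1)
    self.assertTrue(all(m.tag for w in words for m in w.morphs))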
import http.client, urllib.parse
from flask import (Flask, request, abort, render_template, Response, jsonify)
from flask_cors import CORS
import nltk
from wiktionaryparser import WiktionaryParser
from tagmap import TagMap
from chunker import Chunker

# ---------- initialize KHaiii phoneme analyzer

# set up KHaiii api
import khaiii
khaiiiAPI = khaiii.KhaiiiApi()
khaiiiAPI.open()

# ---------- instantiate Flask (global) app --------
parserApp = Flask('app',
                  static_folder="./dist/static",
                  template_folder="./dist")
CORS(parserApp)
# parserApp.config.update(DEBUG=True, SECRET_KEY="iu877hy3jnd8**yd98y334%$#Rjxhg6222", SESSION_COOKIE_HTTPONLY=False)

def run_dev_server():
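# A minimal standalone route sketch using the global khaiiiAPI; the /analyze
# endpoint and its JSON shape are assumptions, not part of the original app.
@parserApp.route('/analyze')
def analyze():
    text = request.args.get('text', '')
    words = [{'lex': w.lex,
              'morphs': [(m.lex, m.tag) for m in w.morphs]}
             for w in khaiiiAPI.analyze(text)] if text.strip() else []
    return jsonify(words)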
def __init__(self):
    self.tokenizer = khaiii.KhaiiiApi()
    super().__init__(self.tokenizer.analyze)
import khaiii

api = khaiii.KhaiiiApi()
api.open()


class Khaiii():
    def pos(self, phrase, flatten=True, join=False):
        """POS tagger.

        :param flatten: If False, preserves eojeols.
        :param join: If True, returns joined sets of morph and tag.
        """
        sentences = phrase.split('\n')
        morphemes = []
        if not sentences:
            return morphemes

        for sentence in sentences:
            for word in api.analyze(sentence):
                result = [(m.lex, m.tag) for m in word.morphs]
                if join:
                    result = ['{}/{}'.format(m.lex, m.tag) for m in word.morphs]
                morphemes.append(result)

        if flatten:
            return sum(morphemes, [])
        return morphemes
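# Example calls for the konlpy-style wrapper above; actual tags depend on
# the khaiii model, so the output shown is illustrative.
tagger = Khaiii()
print(tagger.pos('아버지가 방에 들어가신다'))
# e.g. [('아버지', 'NNG'), ('가', 'JKS'), ('방', 'NNG'), ('에', 'JKB'), ...]
print(tagger.pos('안녕\n잘 가', flatten=False, join=True))
# e.g. [['안녕/IC'], ['잘/MAG'], ['가/VV', '아/EF']]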