Example No. 1
from khaiii import KhaiiiApi
api = KhaiiiApi()

for i in range(10):
    s = input()

    # keep only the "morph/TAG" analysis part (after the tab) for each word
    res = []
    for word in api.analyze(s):
        res.append(str(word))
    res = [word.split('\t')[1] for word in res]

    print(res)
Example No. 2
    def __init__(self, df):
        self.df = df
        self.lex = []
        self.tag = []
        self.api = KhaiiiApi()
Example No. 3
class Tag_parser:
    def __init__(self, soup, url):
        self.tags = []
        self.titles = {}
        self.contents = {}
        self.stopwords = ['<!', 'script', 'function', '#']
        self.api = KhaiiiApi(
            '/home/hwang/khaiii/khaiii/build/lib/libkhaiii.so.0.4',
            '/home/hwang/khaiii/khaiii/build/share/khaiii')
        #self.tables=table_reader.get_all_tables(soup)
        #print(url,self.tables)
        self.table_count = 0
        '''
        self.url = 'http://hosp.ajoumc.or.kr/MedicalInfo/HospitalRoomGuide.aspx'
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        driver = webdriver.Chrome('./chrome/chromedriver_linux64/chromedriver', chrome_options=chrome_options)
        driver.implicitly_wait(3)
        driver.set_page_load_timeout(100)
        driver.get(self.url)
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        '''

        self.recursiveChildren(soup)

    def isstopWord(self, args):
        for word in self.stopwords:
            if word in args or '\n' == args or str(
                    type(args)
            ) == "<class 'bs4.element.Comment'>":  # ignore comment
                return True

        return False

    def imgTagparse(self, args):
        if 'alt' in args.attrs.keys():
            return args.attrs['alt']
        else:
            return ""

    def dictvalue_to_list(self, dicts):
        res_list = []
        for v in dicts.values():
            if v is None:
                v = []
            elif str(
                    type(v)) == "<class 'bs4.element.NavigableString'>" or str(
                        type(v)) == "<class 'str'>":
                v = list(v.strip().split())
            res_list.append(tuple(v))

        return tuple(res_list)

    def extract_words(self, text):
        temp = []
        if text == '': return temp

        for word in self.api.analyze(text):
            for morph in word.morphs:
                if 'NN' in morph.tag:
                    temp.append(morph.lex)

        if len(temp):
            return temp

    def parents_name(self, link, tag):
        name = []
        if (type(link) == type('')): return False
        for p in link.parents:
            name.append(p.name)

        if (set(tag) & set(name)): return True
        else: return False

    def recursiveChildren(self, x):
        try:
            for child in x.recursiveChildGenerator():
                if self.isstopWord(child):
                    continue
                name = getattr(child, "name", None)

                if name == 'img':
                    child = self.imgTagparse(child)
                    self.tags.append(name)
                    name = None

                if name is not None:

                    if 'li' == name:
                        ##### insertion to contents dict code
                        self.titles['word_from_contents'] = self.extract_words(
                            child.get_text().strip())
                        self.contents[self.dictvalue_to_list(
                            self.titles)] = [child.get_text().strip()]
                        ######
                    # elif 'table' == name:
                    #     ############  write code here . ########################
                    #     temp_table=self.tables[self.table_count]
                    #     self.table_count+=1
                    #     if not temp_table:
                    #         continue
                    #     for line in temp_table:
                    #         try:
                    #             self.titles['word_from_contents'] = self.extract_words(line)
                    #             self.contents[self.dictvalue_to_list(self.titles)] = [line]
                    #             #print(self.titles,' : ', self.contents[self.dictvalue_to_list(self.titles)])
                    #         except:
                    #             pass
                    #     pass
                    else:
                        self.tags.append(name)
                else:
                    if child.isspace() or len(
                            self.tags
                    ) == 0 or child == '':  # leaf node; don't print spaces or non-tag text
                        continue
                    else:
                        if self.parents_name(child, ['li', 'table']):
                            continue

                        if 'h' in self.tags[-1] or 'img' in self.tags[
                                -1]:  # or 'span' in self.tags[-1]:  # append headline
                            if 'img' in self.tags[-1] and 'h' in self.tags[
                                    -2]:  # img tag in headline
                                self.titles[self.tags[-2]] = child
                            elif 'h' in self.tags[-1]:  # just headline
                                self.titles[self.tags[-1]] = child
                        else:
                            self.titles[
                                'word_from_contents'] = self.extract_words(
                                    child)
                            self.contents[self.dictvalue_to_list(
                                self.titles)] = [
                                    child.strip()
                                ]  # set contents {title : contents}
                            print(
                                self.titles, ' : ',
                                self.contents[self.dictvalue_to_list(
                                    self.titles)])

                    if len(self.tags):
                        self.tags.pop(-1)

        except Exception as ex:
            print("error ", ex)
            return
Example No. 4
import re

from khaiii import KhaiiiApi


class Tokenizer:
    def __init__(self):
        self._api = KhaiiiApi()
        # stopword definitions
        self._stopwords = [
            '말', '곡', '때', '음악', '노래', 'a', 'an', 'the', 'in', 'on', 'at',
            'by', 'of'
        ]

        # replacement (normalization) pairs
        self._alternative = [
            ('k-pop', 'kpop'),
            ('k팝', 'kpop'),
            ('j-pop', 'jpop'),
            ('r&b', 'rnb'),
            ('알앤비', 'rnb'),
            ('락', 'rock'),
            ('재즈', 'jazz'),
            ('째즈', 'jazz'),
            ('힙합', 'hiphop'),
            ('hip-hop', 'hiphop'),
            ('hip-hap', 'hiphop'),
            ('클래식', 'classic'),
            ('발라드', ' 발라드 '),
            ('라붐', 'laboum'),
            ('뉴에이지', 'newage'),
        ]

    def tokenize(self, sentence):
        clean_sentence = sentence.lower()

        # apply the replacement pairs
        for words in self._alternative:
            clean_sentence = re.sub(words[0], words[1], clean_sentence)

        # keep only digits, lowercase English, and Hangul; strip other special characters (including stray jamo like ㅋ)
        clean_sentence = re.sub('[^0-9a-z가-힣]', ' ', clean_sentence)

        morphs = []
        try:
            for word in self._api.analyze(clean_sentence):
                morphs.extend(self._word_tokenize(word))
        except Exception:
            morphs.clear()
            #print('[WARNING] Khaiii can not tokenize...({})'.format(sentence))

        # remove stopwords
        keyword = {lex for lex, _ in morphs if lex not in self._stopwords}

        return list(keyword)

    def _word_tokenize(self, word):
        morphs = []

        prev_lex = ''
        prev_tag = ''

        for morph in word.morphs:
            # merge with the previous noun so compound nouns are kept as one token
            if morph.tag == 'NNG' and prev_tag == 'NNG':
                morphs.append((morphs.pop()[0] + morph.lex, morph.tag))
            elif morph.tag == 'NNG' and prev_tag == 'NNP':
                morphs.append((morphs.pop()[0] + morph.lex, morph.tag))
            elif morph.tag == 'NNP' and prev_tag == 'NNG':
                morphs.append((morphs.pop()[0] + morph.lex, morph.tag))
            elif morph.tag == 'NNP' and prev_tag == 'NNP':
                morphs.append((morphs.pop()[0] + morph.lex, morph.tag))

            elif morph.tag == 'NNG' and prev_tag == 'XR':
                morphs.append((morphs.pop()[0] + morph.lex, morph.tag))
            elif morph.tag == 'NNP' and prev_tag == 'XR':
                morphs.append((morphs.pop()[0] + morph.lex, morph.tag))
            elif morph.tag == 'XR' and prev_tag == 'NNG':
                morphs.append((morphs.pop()[0] + morph.lex, morph.tag))
            elif morph.tag == 'XR' and prev_tag == 'NNP':
                morphs.append((morphs.pop()[0] + morph.lex, morph.tag))

            elif morph.tag == 'NNG' and prev_tag == 'IC':
                morphs.append((prev_lex + morph.lex, morph.tag))
            elif morph.tag == 'NNP' and prev_tag == 'IC':
                morphs.append((prev_lex + morph.lex, morph.tag))

            # common noun
            elif morph.tag == 'NNG':
                morphs.append((morph.lex, morph.tag))
            # proper noun
            elif morph.tag == 'NNP':
                morphs.append((morph.lex, morph.tag))
            # foreign word
            elif morph.tag == 'SL':
                morphs.append((morph.lex, morph.tag))
            # root
            elif morph.tag == 'XR':
                morphs.append((morph.lex, morph.tag))

            # verb: only if 2+ characters
            elif morph.tag == 'VV' and len(morph.lex) > 1:
                morphs.append((morph.lex, morph.tag))
            # adjective: only if 2+ characters
            elif morph.tag == 'VA' and len(morph.lex) > 1:
                morphs.append((morph.lex, morph.tag))

            # number: only if 2+ digits
            elif morph.tag == 'SN' and len(morph.lex) > 1:
                morphs.append((morph.lex, morph.tag))
            # number + bound noun (e.g., 2000년대)
            elif morph.tag == 'NNB' and prev_tag == 'SN':
                morphs.append((prev_lex + morph.lex, morph.tag))

            prev_lex = morph.lex
            prev_tag = morph.tag

        return morphs
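A brief usage sketch for the Tokenizer above; the sample query is an assumption and khaiii must be installed:

if __name__ == '__main__':
    tokenizer = Tokenizer()
    # mixed Korean/English genre query; the replacement pairs map 'r&b' to 'rnb', '힙합' to 'hiphop', etc.
    print(tokenizer.tokenize('신나는 힙합이랑 r&b 노래 추천해줘'))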
Example No. 5
class SoraModule(Module):

    contents = [
        "그걸 말이라고 하냥!?",
        "당연하다냥!",
        "안 된다냥..",
        "언젠가는 될 거다냥!",
        "다시 한번 물어봐냥!",
        "된다냥!",
    ]

    predefined_content = {
        "당첨": ["그럴 수도 있을 것 같다냥!", "그게 된다고 생각하냥?"],
        "먹다": ["그래도 되긴 하지만... 살이 찌지 않을까냥?", "맛있겠다냥!!!ㅜ"],
        "맛있": ["맛있겠다냥!!!ㅜ", "난 싫다냥!", "그건 좋다냥!"],
        "하다": ["괜찮다냥!"],
        "사다": ["돈이 없다냥!"],
        "호불호": ["좋다냥!", "싫다냥!", "완전 좋다냥!!", "완전 싫다냥!!!"],
    }

    required_morph_types = "NVMJEXS"

    tags = [
        ("가", "VV", "가다"),
        ("싶", "VX", "싶다"),
        ("먹", "VV", "먹다"),
        ("당첨", "NNG", "당첨"),
        ("쫒", "NNG", "당첨"),
        ("하", "VX", "하다"),
        ("사", "NNG", "사다"),
        ("맛있", "VA", "맛있다"),
        ("좋아", "IC", "호불호"),
        ("좋아하", "VV", "호불호"),
        ("싫어", "IC", "호불호"),
        ("싫어하", "VV", "호불호"),
    ]

    api = KhaiiiApi()

    @classmethod
    def if_match_tag(cls, morph):
        for tag in cls.tags:
            if morph.lex == tag[0] and morph.tag == tag[1]:
                return tag
        return None

    async def on_message(self, message: discord.Message) -> bool:

        if message.content.startswith("여름아") and message.content.endswith("?"):
            ss = self.api.analyze(message.content.lstrip("여름아"))
            contexts = [
                # (required_morph_type, '가다'),
            ]

            for required_morph_type in self.required_morph_types:
                for s in ss:
                    for morph in s.morphs:
                        match = self.if_match_tag(morph)
                        if match is None:
                            continue
                        lex, tag, match = match
                        if match and required_morph_type in tag:
                            contexts.append((
                                required_morph_type,
                                match,
                            ))
                            break

            print((message.content, " ".join([str(s) for s in ss]), contexts))

            choices = [c[1] for c in contexts]
            key = random.choice(choices) if len(choices) > 0 else ""
            content = random.choice(
                self.predefined_content.get(key, self.contents))
            await message.channel.send("<@{}> {}".format(
                message.author.id, content))

        return False
Example No. 6
from khaiii import KhaiiiApi


def Analyze(self, text, SEP=' + '):
    res = self.analyze(text)
    f = lambda x: str(x).split('\t')[1]  # keep only the "morph/TAG" analysis part
    return SEP.join(map(f, res))


if __name__ == '__main__':
    khai3 = KhaiiiApi()
    setattr(khai3.__class__, 'Analyze', Analyze)
    print(khai3.Analyze('아버지가방에들어가신다.'))
    # 아버지/NNG + 가/JKS + 방/NNG + 에/JKB + 들어가/VV + 시/EP + ㄴ다/EF + ./SF
Example No. 7
import logging  # needed for logging.basicConfig / getLogger below
import telegram  # needed for telegram.Bot below

import pandas as pd  # Pandas, for reading and handling the CSV files
import unittest  # unittest, used to replace whitespace with the '+' character

from khaiii import KhaiiiApi  # Khaiii morphological analyzer
from telegram.ext import Updater, CommandHandler, MessageHandler, Filters, CallbackQueryHandler  # Updater handles incoming updates, CommandHandler fixed commands, MessageHandler messages, Filters text filtering, CallbackQueryHandler the user's response to an action
from telegram import InlineKeyboardButton, InlineKeyboardMarkup  # InlineKeyboardButton and InlineKeyboardMarkup implement the Telegram button interface

my_token = '846490622:AAHzkPwpgOlnpJJKCn_5oWn3hcV4EIXl3-U'  # token for the Telegram Music Kim bot
bot = telegram.Bot(token=my_token)  # create the Bot with that token

logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO)
logger = logging.getLogger(__name__)  # set up the default logging format

api = KhaiiiApi()  # Khaiii morphological analyzer

sentiment_data = pd.read_csv(
    "KoreanSample.csv",
    encoding='CP949')  # pre-analyzed sentiment dictionary, read with CP949 encoding
music_data = pd.read_csv(
    "SpotifyFeatures.csv"
)  # Spotify music data (SpotifyFeatures.csv)
final_mdf = pd.read_csv(
    "Ranged_SpotifyFeatures.csv"
)  # clustered Spotify music data (Ranged_SpotifyFeatures.csv)


def start(bot, update):  # called on the /start command or when the bot is opened for the first time
    # build the button interface: attach callback data to each button, then mark it up
    show_list = []
Example No. 8
from khaiii import KhaiiiApi
tokenizer = KhaiiiApi()

data = tokenizer.analyze("아버지가방에들어가신다")
tokens = []
for word in data:
    tokens.extend([str(m).split("/")[0] for m in word.morphs])
Example No. 9
#!/usr/bin/python

import util
from khaiii import KhaiiiApi
import re

path = "../data/191017/1630/D_K_03"
filelist = util.get_filelist(path)

file = filelist[0]

with open(file, 'r', encoding='utf-8') as fp:
    khaiii = KhaiiiApi()
    strList = []
    while True:
        line = fp.readline()

        if not line:
            break

        if line == "\n":
            continue

        # temporary workaround for a preprocessing error; safe to remove later
        line = re.sub("\xa0", " ", line).strip()
        if line == "":
            continue

        # to be refined and modularized later
        ##############################################################
        # preprocessing before morphological analysis (remove symbols that degrade the analysis)
Example No. 10
def khaiii_tokenizer(sentence, tokenizer=KhaiiiApi()):
    pass
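The function above is only a stub; a minimal completion sketch that mirrors the morph-extraction pattern used in the other examples here (returning only lexemes is an assumption, not the original author's code):

def khaiii_tokenizer(sentence, tokenizer=KhaiiiApi()):
    # return the lexeme of every morpheme khaiii finds in the sentence
    tokens = []
    for word in tokenizer.analyze(sentence):
        tokens.extend(morph.lex for morph in word.morphs)
    return tokens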
Example No. 11
from khaiii import KhaiiiApi
import re

p = re.compile(r'\(.+\)')
p2 = re.compile(r'\[.+\]')

api = KhaiiiApi()

pre_data = open('pre.txt', 'r', encoding='utf-8')
post_data = open('post.txt', 'r', encoding='utf-8')
output_data = open('preprocessed_data.txt', 'w', encoding='utf-8')

pre_data_lines = pre_data.readlines()
post_data_lines = post_data.readlines()

cnt = 0
for pre_data_line in pre_data_lines:
    pre_key = ' '.join(pre_data_line.split(' ')[0:2])
    for post_data_line in post_data_lines[cnt:]:
        post_key = ' '.join(post_data_line.split(' ')[0:2])
        if pre_key == post_key:
            cnt += 1
            pre_data_line = pre_data_line.replace('\n', '')
            post_data_line = post_data_line.replace('\n', '')
            pre_sent = ' '.join(pre_data_line.split(' ')[2:])
            post_sent = ' '.join(post_data_line.split(' ')[2:])
            maches = re.findall(p, pre_sent)
            for mach in maches:
                pre_sent = pre_sent.replace(mach, '')
            maches = re.findall(p2, pre_sent)
            for mach in maches:
Example No. 12
from itertools import groupby  # groupby is used in morphKeywording below

from khaiii import KhaiiiApi


class MorphAnalyzer():
    api = KhaiiiApi()

    def morphAnalyze(self, content):
        result = list()
        #print(content,'\n')
        for word in self.api.analyze(content):
            for morph in word.morphs:
                result.append([morph.lex, morph.tag])
        return result

    def morphKeywording(self, content):
        keyword = list()
        for word in content:
            if (word[1] in ['NNG', 'NNP', 'NNB']):
                word[1] = 'NN'
        for word in content:  # single nouns with 5+ characters
            if (word[1] == 'NN' and len(word[0]) >= 5):
                keyword.append(word[0])
        group = list()
        for k, g in groupby(content,
                            lambda x: x[1]):  # group by tag: [(tag, word), (tag, word), ...]
            listg = [x[0] for x in list(g)]
            group.append((k, listg))
        #print("Iter Group :",group)
        for word in group:  # extract compound nouns
            if (word[0] == 'NN' and len(word[1]) >= 5):
                keyword.append(word[1])
        for index in range(len(group) - 2):  # noun + 의/와/과 + noun, or noun + adnominal suffix + noun
            if (group[index][0] == 'NN' and group[index + 2][0] == 'NN'):
                if (group[index + 1][1] in ['적', '화', '의', '와', '과']):
                    keyword.append(group[index][1] + group[index + 1][1] +
                                   group[index + 2][1])
        for index in range(len(group) - 3):  # noun + sentiment adjective + noun, or noun + predicative suffix + noun
            if (group[index][0] == 'NN' and group[index + 3][0] == 'NN'):
                if (group[index + 1][0] in ['VA', 'XSV', 'XSA']):
                    keyword.append(group[index][1] + group[index + 1][1] +
                                   group[index + 2][1] + group[index + 3][1])
        for index in range(len(group) - 2):  # sentiment adjective + noun
            if (group[index][0] == 'VA' and group[index + 1][0] == 'ETM'
                    and group[index + 2][0] == 'NN'):
                keyword.append(group[index][1] + group[index + 1][1] +
                               group[index + 2][1])

        for index, word in enumerate(keyword):  # merge list-valued keywords into single strings
            if type(word) == list:
                merge = ''
                for i in word:
                    merge += i
                keyword[index] = merge

        del_list = list()
        append_list = list()
        for word in keyword:  # filter keywords
            if word[-1] in ['것', '수']:
                del_list.append(word)
            elif word[3:5] == '의원':
                del_list.append(word)
                append_list.append(word[5:])
            elif '.' in word or '․' in word or '․' in word:
                del_list.append(word)

        for word in del_list:
            keyword.remove(word)

        for word in append_list:
            keyword.append(word)

        del_list = list()
        for word in keyword:
            if len(word) < 5:
                del_list.append(word)
        for word in del_list:
            keyword.remove(word)
        return keyword
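A short usage sketch for MorphAnalyzer above; the sample sentence is an assumption:

if __name__ == '__main__':
    analyzer = MorphAnalyzer()
    morphs = analyzer.morphAnalyze('국회의원들이 혁신적인 지역 발전 정책을 발표했다.')
    print(analyzer.morphKeywording(morphs))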
Example No. 13
class NewsProcessor:
    def __init__(self,
                 news_path='/home/ir1067/FOR_TITLE/Title_33_2014_all',
                 fin_path='/home/ir1067/price_w_indicator',
                 kospi_data_path="/home/ir1067/data/kospi.csv"):
        # path
        self.fin_path = fin_path

        # .xlsx file -> news text data
        self.xlsx_list = [
            name for name in os.listdir(news_path)
            if ('.xlsx' in name) & ('#' not in name)
        ]
        self.xlsx_list.sort()

        # company names
        self.company_list = set([file[:-5] for file in self.xlsx_list])

        # .csv file -> financial data [open, close, price]
        self.csv_list = [
            name for name in os.listdir(fin_path)
            if ('.csv' in name) & ('#' not in name)
        ]
        self.csv_list.sort()

        # kospi market data [open, close]
        self.kospi = pd.read_csv(kospi_data_path).set_index('date')
        self.kospi.index = pd.to_datetime(self.kospi.index)
        self.kospi = self.kospi[['open', 'close']]
        self.kospi['open'] = [
            re.sub(',', '', text) for text in self.kospi['open']
        ]
        self.kospi['close'] = [
            re.sub(',', '', text) for text in self.kospi['close']
        ]
        self.kospi.open = self.kospi.open.astype(float)
        self.kospi.close = self.kospi.close.astype(float)
        # Khaiii API
        self.khaiii = KhaiiiApi()

        # print info
        print("Data file infomation")
        print("- News data (xlsx):\t{}".format(len(self.xlsx_list)))
        print("- Price data (csv):\t{}".format(len(self.csv_list)))
        print("- Company count:\t{}".format(len(self.company_list)))
        print("NewsProcessor init complete.")

    def get_xlsx(self, company_name):
        output = pd.DataFrame()

        # xlsx files containing company name
        data_list = [
            filename for filename in self.xlsx_list if company_name in filename
        ]

        for filename in data_list:
            news = pd.read_excel(filename, index_col=0)
            output = pd.concat([output, news])
        output.reset_index(inplace=True, drop=True)

        print("Data NaN infomation")
        for col in output.columns:
            output[col] = [
                text if text != "" else np.nan for text in output[col]
            ]
        print(output.isna().sum())

        output['date'] = pd.to_datetime(output['date'], format="%Y.%m.%d")
        output.set_index('date', drop=True, inplace=True)

        return output

    def get_csv(self, company_name):
        output = pd.DataFrame()

        # csv files containing company name
        data_list = [
            filename for filename in self.csv_list if company_name in filename
        ]

        for filename in data_list:
            price = pd.read_csv(self.fin_path + filename, index_col=0)
            output = pd.concat([output, price])

        output.index = pd.to_datetime(output.index, format="%Y.%m.%d")

        return output

    def clean_text(self, data):
        # consider also removing ⓒ~, copyright-holder~, and reporter~ snippets

        data['title'] = [re.sub('\[.+?\]', '', text, 0, re.I | re.S).strip() \
                                for text in data['title']]
        data['title'] = [
            text if text != '' else np.nan for text in data['title']
        ]

        data['contents'] = [text.replace("// flash 오류를 우회하기 위한 함수 추가\nfunction _flash_removeCallback() {}", "") \
                            for text in data['contents']]
        data['contents'] = [re.sub('\(.+?\)', '', text, 0, re.I | re.S).strip() \
                            for text in data['contents']]
        data['contents'] = [re.sub('{.+?}', '', text, 0, re.I | re.S).strip() \
                            for text in data['contents']]
        data['contents'] = [re.sub('\[.+?\]', '', text, 0, re.I | re.S).strip() \
                            for text in data['contents']]
        data['contents'] = [re.sub('<.+?>', '', text, 0, re.I | re.S).strip() \
                            for text in data['contents']]
        data['contents'] = [re.sub('<.+?>', '', text, 0, re.I | re.S).strip() \
                            for text in data['contents']]
        # one article starts with ▶
        #data['contents'] = [re.sub('▶.*', '', text, 0, re.I | re.S).strip().replace(",", "") \
        #                     for text in data['contents']]

        print("Check NaN")
        print(data.isna().sum())

    def drop_empty(self, data):
        print("Data length before drop: ", len(data))
        data.dropna(inplace=True, how='any')
        data.drop(data[data['contents'] == ''].index, inplace=True)
        data = data.reset_index(drop=True)
        #index = [news.index for news in data['contents'] if news == '']
        #for i in index:
        #   data.drop([data.index[i]], inplace= True)
        print("Data length after drop:  ", len(data))

    def tokenizing(self, data, tag):
        if type(tag) == list:
            try:
                print("Start Khaiii analyze")
                after_analyze = [
                    self.khaiii.analyze(news) for news in data['contents']
                    if news != ''
                ]
                print("Done")

                tokenized = [[morph.lex for chunk in news for morph in chunk.morphs \
                       if morph.tag in tag] for news in after_analyze]
                tokenized = [
                    text if text != [] else np.nan for text in tokenized
                ]

                is_empty = [1 if text is np.nan else 0 for text in tokenized]
                print("Empty list after tokenizing: {}".format(sum(is_empty)))

                return tokenized

            except KeyError as e:
                print(e, "DataFrame does not have 'contents' column")
        else:
            print("Error: parameter 'tag' must be list")

    def tokenizing_title(self, data, tag):
        if type(tag) == list:
            try:
                print("Start Khaiii analyze")
                after_analyze = [
                    self.khaiii.analyze(news) for news in data['title']
                    if news != ''
                ]
                print("Done")

                tokenized_title = [[morph.lex for chunk in news for morph in chunk.morphs \
                       if morph.tag in tag] for news in after_analyze]
                tokenized_title = [
                    text if text != [] else np.nan for text in tokenized_title
                ]

                is_empty = [
                    1 if text is np.nan else 0 for text in tokenized_title
                ]
                print("Empty list after tokenizing: {}".format(sum(is_empty)))

                return tokenized_title

            except KeyError as e:
                print(e, "DataFrame does not have 'contents' column")
        else:
            print("Error: parameter 'tag' must be list")

    def labeling(self, data, kospi, days):
        data.index = pd.to_datetime(data.index)

        kospi.columns = ['k_open', 'k_close']
        data = pd.merge(data,
                        kospi,
                        how='right',
                        left_index=True,
                        right_index=True)
        data = data.dropna(how='any')

        for day in days:
            if day == 1:
                open = data['open']
                close = data['close']
                rtn = close / open - 1
                mkt_rtn = data['k_close'] / data['k_open'] - 1
                data['label'] = (rtn > mkt_rtn).astype(int).shift(-1)
            else:
                price = data['adj_close']
                rtn = price.pct_change(day).shift(-day - 1)
                mkt_rtn = data['k_close'].pct_change(day).shift(-day - 1)
                data['label%d' % day] = (rtn > mkt_rtn).astype(int)

        data.drop(['k_open', 'k_close'], inplace=True, axis=1)
        indicators = data[data.columns[:]]

        return indicators

    def to_datetime(self, unified_file):
        for i in range(len(unified_file.date)):
            if i % 100 == 0:
                print('processing', i)
            if len(unified_file.date[i]) == 19:
                unified_file.date[i] = unified_file.date[
                    i][:15] + '0' + unified_file.date[i][15:]

            if bool(re.search('오후', unified_file.date[i])) & bool(
                    unified_file.date[i][15:17] != '12') == True:
                unified_file.date[i] = unified_file.date[i][:15] + '{}'.format(int(unified_file.date[i][15:17]) + 12) + \
                                        unified_file.date[i][17:]

            unified_file.date[
                i] = unified_file.date[i][:12] + unified_file.date[i][15:]

        unified_file.date = pd.to_datetime(unified_file.date)
        unified_file.set_index(['date'], inplace=True)
Example No. 14
sentence = u'내년도 최저임금을 기존 방식대로 전체 업종에 동일하게 적용하기로 결정했다.\
최저임금의 업종별 차등 적용을 요구해온 사용자위원들은 이에 반발해 전원회의에서 퇴장했다.\
최저임금위원회 사용자위원들은 이날 오후 정부세종청사에서 열린 최저임금위원회 제5차 전원회의 도중 퇴장해 기자들과 만나 \
"금일 최저임금위원회는 최저임금 고시에 월 환산액을 병기하고 2020년 최저임금을 모든 업종에 동일하게 적용하기로 결정했다"고 밝혔다.'
sentences = [sentence] * 10000

import time
from konlpy.tag import Hannanum, Kkma, Komoran, Okt, Mecab
from khaiii import KhaiiiApi
api = KhaiiiApi()
morphs_processors = [('Hannanum', Hannanum()), ('Kkma', Kkma()), ('Komoran', Komoran()), ('Okt', Okt()), ('mecab', Mecab())]
for name, morphs_processor in morphs_processors:
    start_time = time.time()
    morphs = [morphs_processor.pos(sentence) for sentence in sentences]
    elapsed_time = time.time() - start_time
    print('morphs_processor name = %20s, %.5f secs' % (name, elapsed_time))
start_time = time.time()
morphs = [api.analyze(sentence) for sentence in sentences]
elapsed_time = time.time() - start_time
print('morphs_processor name = %20s, %.5f secs' % ('khaiii', elapsed_time))
Example No. 15
print(len(data['FReview'][1]), len(extract_corpus_mecab))

# Install Khaiii on Colab

!git clone https://github.com/kakao/khaiii.git
!pip install cmake
!mkdir build
!cd build && cmake /content/khaiii
!cd /content/build/ && make all
!cd /content/build/ && make resource
!cd /content/build && make install
!cd /content/build && make package_python
!pip install /content/build/package_python

from khaiii import KhaiiiApi
api = KhaiiiApi()

n_tags = ['NNG', 'NNP', 'NNB', 'VV', "VA" ]
ex = data['Review'][2]
bad_ex = data['Review'][4]

def extract_corpus_khaiii(texts):
    extract_corpus = []
    for line in texts:
      if str(line) != 'nan':
        nouns = []

        for word in api.analyze(str(line)):
          for morphs in word.morphs:
            if morphs.tag in n_tags:
              nouns.append(morphs.lex)
Example No. 16
from typing import List
from khaiii import KhaiiiApi

_tokenizer = KhaiiiApi()


def tokenize(본문, tagged=False) -> List:
    형태분석 = []
    if not 본문.strip():
        return 형태분석

    분석결과 = _tokenizer.analyze(본문)
    for 어절_형태분석 in 분석결과:
        for 요소 in 어절_형태분석.morphs:
            if tagged:
                형태분석.append((요소.lex, 요소.tag))
            else:
                형태분석.append(요소.lex)
    return 형태분석
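A short usage sketch for tokenize above; the sample sentence is an assumption:

if __name__ == '__main__':
    print(tokenize('아버지가 방에 들어가신다'))
    print(tokenize('아버지가 방에 들어가신다', tagged=True))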
Example No. 17
        chosung = CHOSUNGS[chosung_index]
        joongsung = JOONGSUNGS[joongsung_index]
        jongsung = JONGSUNGS[jongsung_index]
        
        # syllables with no jongsung (final consonant) get end_char instead
        if jongsung_index == 0:
            jongsung = end_char
        
        result.append(chosung)
        result.append(joongsung)
        result.append(jongsung)

    return "".join(result)

# khaiii
khaiii = KhaiiiApi()
def khaiii_tokenize(text):
    tokens = []
    for word in khaiii.analyze(text):
        tokens.extend([str(m).split('/')[0] for m in word.morphs])
    return tokens

# konlpy tokenizers
mecab = Mecab().morphs
okt = Okt().morphs
komoran = Komoran().morphs
hannanum = Hannanum().morphs  # raises an error
kkma = Kkma().morphs

def space_tokenizer(text):
    return text.split(' ')
Example No. 18
def augment_data(args):

    # Option checking
    if args.no_analyzer:
        args.p_pos = 0. # disable replacement using POS tags.

    # Load original tsv file
    input_tsv = load_tsv(args.input, skip_header=False)

    if args.no_analyzer:
        sentences = []
        for text, label in tqdm(input_tsv, desc='No POS tagging'):
            sentence = []
            for token in text.split():
                tag = 'word'
                word = Word(token, tag)
                sentence.append(word)
            sentences.append((sentence, label))
    else:
        # POS tagging
        if args.analyzer == 'spacy':
            import spacy
            from spacy.symbols import ORTH
            spacy_en = spacy.load('en_core_web_sm')
            spacy_en.tokenizer.add_special_case(args.mask_token, [{ORTH: args.mask_token}])
            sentences = [(spacy_en(text), label) for text, label in tqdm(input_tsv, desc='POS tagging')]
        if args.analyzer == 'khaiii':
            from khaiii import KhaiiiApi
            khaiii_api = KhaiiiApi()
            sentences = []
            for text, label in tqdm(input_tsv, desc='POS tagging'):
                sentence = []
                khaiii_sentence = khaiii_api.analyze(text)
                for khaiii_word in khaiii_sentence:
                    for khaiii_morph in khaiii_word.morphs:
                        morph = khaiii_morph.lex
                        tag = khaiii_morph.tag
                        # we might need to modify 'morph' for matching the vocab of GloVe.
                        # ex) if tag in ['VV', 'VA', 'VX', 'XSV', 'XSA', 'VCP']: morph += u'다'
                        word = Word(morph, tag)
                        sentence.append(word)
                sentences.append((sentence, label))
        if args.analyzer == 'npc':
            sys.path.append('data/clova_sentiments_morph/npc-install/lib')
            import libpnpc as pnpc
            res_path = 'data/clova_sentiments_morph/npc-install/res'
            npc = pnpc.Index()
            npc.init(res_path)
            sentences = []
            for text, label in tqdm(input_tsv, desc='POS tagging'):
                sentence = []
                npc_sentence = npc.analyze(text)
                for item in npc_sentence:
                    meta = item['meta']
                    if meta != '[NOR]': continue
                    morph = item['morph']
                    tag = item['mtag']
                    word = Word(morph, tag)
                    sentence.append(word)
                sentences.append((sentence, label))

    if args.no_augment:
        # Write to file
        with open(args.output, 'w') as f:
            for sentence, label in tqdm(sentences, desc='Writing'):
                s = [] 
                for word in sentence:
                    s.append(word.text)
                if args.preserve_label: out_label = label
                else: out_label = args.dummy_label
                f.write("{}\t{}\n".format(' '.join(s), out_label))
        sys.exit(0)

    # Build lists of words indexes by POS
    pos_dict = {} if args.no_analyzer else build_pos_dict(sentences, lower=args.lower)

    # Generate augmented samples
    if args.parallel:
        pool = mp.Pool(mp.cpu_count())
        # process in parallel
        entries = []
        for sentence, label in tqdm(sentences, desc='Preparation data for multiprocessing'):
            entry = {'sentence': sentence,
                     'label': label,
                     'pos_dict': pos_dict,
                     'args': args}
            entries.append(entry)
        print('Data ready! go parallel!') 
        sentences = pool.map(make_samples, entries, chunksize=100)
        sentences = reduce(lambda x,y: x+y, sentences)
        pool.close()
        pool.join()
        print('Done!')
    else:
        # process sequentially
        augmented = []
        for sentence, label in tqdm(sentences, desc='Sampling'):
            entry = {'sentence': sentence,
                     'label': label,
                     'pos_dict': pos_dict,
                     'args': args}
            samples = make_samples(entry) 
            augmented.extend(samples)
        sentences = augmented

    # Write to file
    with open(args.output, 'w') as f:
        for sentence, label in tqdm(sentences, desc='Writing'):
            if args.preserve_label: out_label = label
            else: out_label = args.dummy_label
            f.write("{}\t{}\n".format(' '.join(sentence), out_label))
Example No. 19
class TitleMethod(Method):
    """ Title KNN Method class for playlist continuation task cold start problem.
    
    Title KNN Method.

    Attributes:
        name (str)  : name of method
        playlist2idx (dict) : playlist to index dictionary.
        title2playlist (dict)   : title to list of playlists dictionary.
        token2idx (dict)    : NLP processed token to index dictionary.
        token2title (dict)  : NLP processed token to list of titles dictionary. 
        doc2vec_model (doc2vec) : Doc2Vec Model in gensim.
        tt_matrix (sparse matrix)   : NLP processed token to tag matrix
        ts_matrix (sparse matrix)   : NLP processed token to song matrix
        api (KhaiiiApi)  : Korean Tokenizer
    Return:
    """
    def __init__(self, name):
        super().__init__(name)

        self.playlist2idx = dict()
        self.title2playlist = dict()
        self.token2idx = dict()
        self.token2title = dict()

        self.unique_token = set()

        self.doc2vec_model = None

        self.tt_matrix = None
        self.ts_matrix = None

        self.api = KhaiiiApi()

    def _tokenize_title(self, title):
        """ Tokenize playlist title.
        
        Tokenize playlist title using khaiii.

        Attributes:
            title (str) : playlist title
        Return:
            token (list)   : list of "lexicon/tag" token
        """

        token = list()
        try:
            words = self.api.analyze(title)
        except KhaiiiExcept:
            words = list()

        for word in words:
            for morph in word.morphs:
                if morph.tag[:2] in ['NN', 'VV', 'VA', 'VC', 'MM', 'XR'
                                     ] or morph.tag == 'MAG':
                    token.append('/'.join([morph.lex, morph.tag]))

        return token

    def _prepare_data(self):
        """ Prepare necessary data structures for Title KNN Method.

        Prepare necessary data structures for Title KNN Method.

        """

        ### tokenize using khaiii
        ### make csr matrix (token - tag | song)
        row = {'tag': list(), 'song': list()}
        col = {'tag': list(), 'song': list()}
        data = {'tag': list(), 'song': list()}

        token_id = 0
        for title, playlist in self.title2playlist.items():

            # check whether this title is in the train dataset (not the validation or test dataset)
            has_train_playlist = False
            for p in playlist:
                playlist_id = self.playlist2idx[p]
                if playlist_id < self.n_train:
                    has_train_playlist = True
                    break

            if not has_train_playlist:
                continue

            token = self._tokenize_title(title)

            for t in token:
                if t in self.token2idx:
                    token_id = self.token2idx[t]
                else:
                    self.token2idx[t] = token_id

                for p in playlist:
                    playlist_id = self.playlist2idx[p]
                    if playlist_id < self.n_train:
                        for item_id in self.pt_train[playlist_id].nonzero()[1]:
                            row['tag'].append(token_id)
                            col['tag'].append(item_id)
                            data['tag'].append(1)

                        for item_id in self.ps_train[playlist_id].nonzero()[1]:
                            row['song'].append(token_id)
                            col['song'].append(item_id)
                            data['song'].append(1)

                token_id = len(self.token2idx)

        self.tt_matrix = csr_matrix((data['tag'], (row['tag'], col['tag'])),
                                    dtype=float)
        self.ts_matrix = csr_matrix((data['song'], (row['song'], col['song'])),
                                    dtype=float)

        _, self.tt_matrix = transform_idf(self.tt_matrix)
        _, self.ts_matrix = transform_idf(self.ts_matrix)

    def _rate(self, pid, mode):
        """ Make ratings.
        
        Rate on items(tag/song) based on test data, which index is pid.
        
        Args:
            pid (int)   : playlist id in test data
            mode (str)  : determine which item. tags or songs
        Return:
            rating(numpy array): playlist and [tags or songs] rating 
        """

        assert mode in ['tags', 'songs']

        title_matrix = self.tt_matrix if mode == 'tags' else self.ts_matrix
        n = self.n_tag if mode == 'tags' else self.n_song

        idx2playlist = {
            idx: playlist
            for playlist, idx in self.playlist2idx.items()
        }
        playlist2title = dict()
        for title, playlists in self.title2playlist.items():
            for playlist in playlists:
                playlist2title[playlist] = title

        rating = np.zeros(n)

        playlist = idx2playlist[pid + self.n_train]
        title = playlist2title[playlist]
        token = self._tokenize_title(title)
        token = [t for t in token if t in self.token2idx.keys()]
        token_ids = [self.token2idx[t] for t in token]

        if len(token_ids) == 0:
            return rating

        rating = np.sum(title_matrix[token_ids, :].toarray(),
                        axis=0).reshape(-1)
        return rating

    def initialize(self,
                   n_train,
                   n_test,
                   pt_train,
                   ps_train,
                   pt_test,
                   ps_test,
                   transformer_tag,
                   transformer_song,
                   checkpoint_dir='./checkpoints'):
        """ initialize necessary variables for Method.

        initialize necessary data structure.

        Args: 
            n_train (int)   : number of playlist in train dataset.
            n_test (int)    : number of playlist in test dataset. 
            pt_train (csr_matrix)   : playlist to tag sparse matrix made from train dataset.
            ps_train (csr_matrix)   : playlist to tag sparse matrix made from train dataset.
            pt_test (csr_matrix)    : playlist to tag sparse matrix made from test dataset.
            ps_test (csr_matrix)    : playlist to song sparse matrix made from test dataset.
            transformer_tag (TfidfTransformer)  : scikit-learn TfidfTransformer model fitting pt_train.
            transformer_song (TfidfTransformer) : scikit-learn TfidfTransformer model fitting ps_train.
            checkpoint_dir (str)    : where to save similarity matrix.
        Return:
        """

        super().initialize(n_train, n_test, pt_train, ps_train, pt_test,
                           ps_test, transformer_tag, transformer_song)

        ### tokenize using khaiii
        ### make csr matrix (token - tag | song)
        row = {'tag': list(), 'song': list()}
        col = {'tag': list(), 'song': list()}
        data = {'tag': list(), 'song': list()}

        token_id = 0
        for title, playlist in self.title2playlist.items():

            # check whether this title is in the train dataset (not the validation or test dataset)
            has_train_playlist = False
            for p in playlist:
                playlist_id = self.playlist2idx[p]
                if playlist_id < self.n_train:
                    has_train_playlist = True
                    break

            if not has_train_playlist:
                continue

            token = self._tokenize_title(title)

            for t in token:
                if t in self.token2idx:
                    token_id = self.token2idx[t]
                else:
                    self.token2idx[t] = token_id

                for p in playlist:
                    playlist_id = self.playlist2idx[p]
                    if playlist_id < self.n_train:
                        for item_id in self.pt_train[playlist_id].nonzero()[1]:
                            row['tag'].append(token_id)
                            col['tag'].append(item_id)
                            data['tag'].append(1)

                        for item_id in self.ps_train[playlist_id].nonzero()[1]:
                            row['song'].append(token_id)
                            col['song'].append(item_id)
                            data['song'].append(1)

                token_id = len(self.token2idx)

        self.tt_matrix = csr_matrix((data['tag'], (row['tag'], col['tag'])),
                                    dtype=float)
        self.ts_matrix = csr_matrix((data['song'], (row['song'], col['song'])),
                                    dtype=float)

        _, self.tt_matrix = transform_idf(self.tt_matrix)
        _, self.ts_matrix = transform_idf(self.ts_matrix)

    def predict(self, pid, mode):
        """ Make ratings based on mode.

        rate the playlist, which index in test sparse matrix is pid based on mode.

        Args: 
            pid (int)   : playlist id in test sparse matrix
            mode (str)  : tags or songs
        Return:
            rating (ndarray)    : playlist id and rating
        """
        rating = self._rate(pid, mode=mode)
        return rating
Example No. 20
# strip whitespace
for i in range(len(playlist)):
    playlist[i] = playlist[i].strip()

# # In[7]:

# # words = ['발라드','캐럴','케롤','스타워즈','뉴에이지','게임','프로필',
# #          '마쉬멜로','유산슬','감성발라드','일렉트로닉','섹시','록메탈','힐링']

# # MD.fn_add_khaiidic(words)

# # In[4]:

from khaiii import KhaiiiApi
from pprint import pprint
api = KhaiiiApi(rsc_dir='khaiii/build/share/khaiii')  # my local install path

from multiprocessing import Pool
import time


def fn_analyze_khaiii(i):
    tmp_list = []

    if len(playlist[i]) == playlist[i].count(' '):
        # row consisting only of spaces
        pass
    elif playlist[i].find(" ") == -1:
        # row with no spaces
        tmp_list.append(playlist[i])
    else:
Example No. 21
for row in list(test_data):  # iterate over a copy so removing rows is safe
    if not row[1]:
        test_data.remove(row)


# In[8]:


from khaiii import KhaiiiApi


# In[9]:


api = KhaiiiApi()


# In[10]:


train_pos = []
for row in train_data:
    sent_pos = []
    sentence = row[1]
    for word in api.analyze(sentence):
        pos = str(word).split('\t')[1]
        sent_pos.append(pos)
    train_pos.append(sent_pos)
train_pos[:5]
Example No. 22
from selenium import webdriver
from bs4 import BeautifulSoup
from khaiii import KhaiiiApi
from tf_idf import Tf_idf
import parsing
import contents_print
import client
from urllib.parse import urljoin
import re

api = KhaiiiApi('./khaiii/khaiii/build/lib/libkhaiii.0.4.dylib',
                './khaiii/khaiii/build/share/khaiii')
db = client.ClientDb()

max_depth = 3
url = 'https://www.mma.go.kr/'
filter_domain = 'http://www.mma.go.kr/'

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome('./chrome/chromedriver_linux64/chromedriver',
                          chrome_options=chrome_options)
driver.implicitly_wait(3)
driver.set_page_load_timeout(100)

visited_pages = set()
keywords = []
df_dict = {}  # total word df
homepages = []  # total pages visited
Example No. 23
  final.append(temp)

import pandas as pd

final = pd.DataFrame(final)
final.columns = ['#']

final.to_csv('/content/khaiii/rsc/src/preanal.manual', encoding='utf-8', index=False, header=False)

# Add entries to the Khaiii user dictionary
!cd /content/khaiii/rsc
!mkdir -p /content/build/share/khaiii
!PYTHONPATH=/content/khaiii/src/main/python /content/khaiii/rsc/bin/compile_preanal.py --rsc-src=/content/khaiii/rsc/src --rsc-dir=/content/build/share/khaiii

from khaiii import KhaiiiApi
api = KhaiiiApi(rsc_dir="/content/build/share/khaiii")

for word in api.analyze('얼그레이가 맛있습니다.'):
  for morphs in word.morphs:
    print(morphs)

data = pd.read_csv('/content/final_preprocessed_data.csv', encoding='utf-8', index_col=0)

data.head()

data.info()

"""## Experiment 1) Khaiii에서 네이버 플레이스 리뷰 명사만 추출해 Topic Modeling"""

from khaiii import KhaiiiApi
api = KhaiiiApi(rsc_dir="/content/build/share/khaiii")
Example No. 24
def khaiii():
    api = KhaiiiApi()
    words = __name__
    for word in api.analyze('안녕, 세상.'):
        words += ", " + str(word)  # Word objects must be converted to str before concatenation
    return words
Example No. 25
from khaiii import KhaiiiApi
api = KhaiiiApi()

import time

print( '한글 문장 입력: ')
input_origin = str(input())

# number of sentences to print
n = 10

#timer start
start = time.time()

f = open('KCC150_K01_utf8.txt') # KCC150_K01_utf8

word = []  # list of morphemes
sentence_morph_list = []  # per-sentence morphological analysis results and occurrence counts
sentence_cnt_of_morph_appear = []  # number of documents each morpheme appears in

# progress (%) display
total_lines = 1000000
line_cnt = 0
percent = 0

# input sentence
input_morph = dict()
for morph_list in api.analyze(input_origin):
    for m in morph_list.morphs:
        if(m.lex not in word):
            word.append(m.lex)
Example No. 26
from khaiii import KhaiiiApi
import io
api = KhaiiiApi()

t = io.open('pre.txt', mode='r', encoding='utf-8')
#x = io.open('2016-10-20.index_new',mode='r', encoding='utf-8')
#nt = io.open('top5_txt',mode='w')

fin = ["EF", "SF", "EC", "ㅋ"]
label = ["NNG", "W", "MAG", "VA", "NNP"]
cnt = 0
while True:
    text = t.readline()
    total = ""
    if text == "\n":
        print()
        continue
    if "FINISH" in text:
        break
    print(cnt)
    for word in api.analyze(text):
        tmp = str(word)
        print(tmp)
        if any(format in tmp for format in label):
            chk = 0
            if any(format in tmp for format in fin):
                chk = 1
            ttmp = tmp.split()
            if chk == 1:
                ttmp[0] = ttmp[0] + "."
            total = total + " " + ttmp[0]
Example No. 27
제시는 요즘 힙합트랩 비트와 알앤비가 마구 섞이는 트렌드에 딱 맞는 재목인데
예능에서만 소비되는게 아까움

톤이면 톤, 스킬이면 스킬 외국에서도 보기드문 보컬인데.. 제시제이랑 좀
비슷한거같으면서 더 찐득한

실제로 본인말로는 외국 유명 프로듀서들이 러브콜해서 진출직전이었는데 코로나사태
이후 중지되었다고.."""

parser = argparse.ArgumentParser(description='토크나이저를 테스트할 문장을 입력하세요.')
parser.add_argument('--text', required=False, default=default_text)
args = parser.parse_args()

tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
khaiii = KhaiiiApi()
mecab = Mecab()
okt = Okt()
komoran = Komoran()
hannanum = Hannanum()
kkma = Kkma()

text = args.text

print("-"*5,"원본 텍스트", "-"*5)
print(text)

print("-"*5, "Mecab", "-"*5)
print(mecab.morphs(text))

print("-"*5, "Okt", "-"*5)
Example No. 28
"""## Gensim Topic Modeling"""

import pandas as pd


data = pd.read_csv('/content/final_input.csv', encoding='utf-8').drop(['Unnamed: 0'], axis=1)

data.info()

"""# Gensim LDA를 위한 데이터 전처리
## Experiment 1) Khaiii에서 명사, 어근만 추출해 Tokenizing
"""

from khaiii import KhaiiiApi
api = KhaiiiApi(rsc_dir="/content/build/share/khaiii")

n_tags = ['NNG', 'NNP', 'NNB', 'XR']#, 'VV', "VA"] # add verbs/adjectives here if desired

'''
input : list of Reviews to extract from ;
output : list of tokens whose tags match n_tags ;
'''

def extract_corpus_khaiii(texts):
    extract_corpus = []
    for line in texts:
      if str(line) != 'nan':
        nouns = []

        for word in api.analyze(str(line)):
Example No. 29
import pickle
import json
import os
import re
import random
import numpy as np
from numpy.linalg import norm
import gensim
import multiprocessing
from collections import Counter  # Counter is used in make_unique_dict below

import pandas as pd
from gensim.models import Word2Vec
from tqdm import tqdm
from khaiii import KhaiiiApi
api = KhaiiiApi()

flatten = lambda l: [item for sublist in l for item in sublist]


def make_unique_dict(tag_list):
    flatten_tags = flatten(tag_list)
    tag_counter = Counter(flatten_tags).most_common()
    df_tag_counter = pd.DataFrame(tag_counter, columns=['tags', 'freq'])
    delete_list = ['노래', '음악', '플레이리스트']
    retreval_tags = []
    merge_list = []
    for i in df_tag_counter['tags'][:673]:
        if i in delete_list:
            pass
        else:
            retreval_tags.append(i)
Example No. 30
def extract_tag_from_song_title():
    MODE = "Test"
    if MODE == "Valid":
        train = pd.concat([
            pd.read_json("arena_data/orig/train.json"),
            pd.read_json("arena_data/questions/val.json")
        ],
                          ignore_index=True)
    else:
        train = pd.read_json("res/train.json")
    dev = pd.read_json("res/val.json")
    test = pd.read_json("res/test.json")

    def re_sub(series: pd.Series) -> pd.Series:
        series = series.str.replace(pat=r'[ㄱ-ㅎ]', repl=r'',
                                    regex=True)  # remove stray jamo such as ㅋ
        series = series.str.replace(pat=r'[^\w\s]', repl=r'',
                                    regex=True)  # remove special characters
        series = series.str.replace(pat=r'[ ]{2,}', repl=r' ',
                                    regex=True)  # collapse repeated spaces
        series = series.str.replace(pat=r'[\u3000]+', repl=r'',
                                    regex=True)  # remove U+3000 (ideographic space)
        return series

    def flatten(list_of_list: List) -> List:
        flatten = [j for i in list_of_list for j in i]
        return flatten

    def get_token(title: str, tokenizer) -> List[Tuple]:

        if len(title) == 0 or title == ' ':  # an empty title makes the tokenizer raise an error
            return []

        result = tokenizer.analyze(title)
        result = [(morph.lex, morph.tag) for split in result
                  for morph in split.morphs]  # list of (morpheme, POS tag) tuples
        return result

    def get_all_tags(df) -> List:
        tag_list = df['tags'].values.tolist()
        tag_list = flatten(tag_list)
        return tag_list

    tokenizer = KhaiiiApi()
    all_tag = get_all_tags(pd.concat([train, dev, test]))
    token_tag = [get_token(x, tokenizer) for x in all_tag]  # morphologically analyze every tag

    token_itself = list(filter(lambda x: len(x) == 1,
                               token_tag))  # keep only tags that are themselves a single morpheme (not split)
    token_itself = flatten(token_itself)
    flatten_token = flatten(token_tag)

    print('%-23s' % '# of original tag is', f'{len(all_tag):8,}')
    print('%-23s' % '# of morpheme itself is', f'{len(token_itself):8,}')
    print('%-23s' % '# of total token is', f'{len(flatten_token):8,}')

    train['plylst_title'] = re_sub(train['plylst_title'])
    train.loc[:, 'ply_token'] = train['plylst_title'].map(
        lambda x: get_token(x, tokenizer))

    # see https://github.com/kakao/khaiii/wiki/%EC%BD%94%ED%8D%BC%EC%8A%A4 for the POS tag table
    using_pos = ['NNG', 'SL', 'NNP', 'MAG',
                 'SN']  # common noun, foreign word, proper noun, general adverb, number
    train['ply_token'] = train['ply_token'].map(
        lambda x: list(filter(lambda x: x[1] in using_pos, x)))

    unique_tag = set(token_itself)
    unique_word = [x[0] for x in unique_tag]

    # since the goal is to predict the answer tags, keep only morphemes that appear in the answer tags
    train['ply_token'] = train['ply_token'].map(
        lambda x: list(filter(lambda x: x[0] in unique_word, x)))
    train['predict_tag'] = train['ply_token'].map(
        lambda x: [tag[0] for tag in x])
    train['predict_tag'] = train.apply(
        lambda x: [tag for tag in x.predict_tag if tag not in x.tags],
        axis=1)  # exclude tags already in the answers

    dev['plylst_title'] = re_sub(dev['plylst_title'])
    dev.loc[:, 'ply_token'] = dev['plylst_title'].map(
        lambda x: get_token(x, tokenizer))
    dev['ply_token'] = dev['ply_token'].map(
        lambda x: list(filter(lambda x: x[1] in using_pos, x)))
    dev['ply_token'] = dev['ply_token'].map(
        lambda x: list(filter(lambda x: x[0] in unique_word, x)))
    dev['predict_tag'] = dev['ply_token'].map(lambda x: [tag[0] for tag in x])
    dev['predict_tag'] = dev.apply(
        lambda x: [tag for tag in x.predict_tag if tag not in x.tags],
        axis=1)  # exclude tags already in the answers

    test['plylst_title'] = re_sub(test['plylst_title'])
    test.loc[:, 'ply_token'] = test['plylst_title'].map(
        lambda x: get_token(x, tokenizer))
    test['ply_token'] = test['ply_token'].map(
        lambda x: list(filter(lambda x: x[1] in using_pos, x)))
    test['ply_token'] = test['ply_token'].map(
        lambda x: list(filter(lambda x: x[0] in unique_word, x)))
    test['predict_tag'] = test['ply_token'].map(
        lambda x: [tag[0] for tag in x])
    test['predict_tag'] = test.apply(
        lambda x: [tag for tag in x.predict_tag if tag not in x.tags],
        axis=1)  # exclude tags already in the answers

    final = []
    final_dict = dev[['id', 'predict_tag']].to_dict('index')
    final += [i for i in final_dict.values()]
    final_dict = test[['id', 'predict_tag']].to_dict('index')
    final += [i for i in final_dict.values()]
    final_dict = train[['id', 'predict_tag']].to_dict('index')
    final += [i for i in final_dict.values()]
    distutils.dir_util.mkpath("./arena_data/model")
    with open('arena_data/model/pred_tag.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(final, ensure_ascii=False))