class khaiii_complexnoun:
    def __init__(self, df):
        self.df = df
        self.lex = []
        self.tag = []
        self.api = KhaiiiApi()

    def khaiii_complex(self, filename=''):
        df = self.df

        def khaiii(sentence):
            # Collect morphemes for this sentence only; using locals avoids
            # re-scanning morphemes accumulated from previous rows on each apply() call.
            lex, tag = [], []
            for word in self.api.analyze(str(sentence)):
                for m in word.morphs:
                    lex.append(m.lex)
                    tag.append(m.tag)
            self.lex.extend(lex)
            self.tag.extend(tag)

            complex_nouns = []
            for i in range(len(tag) - 1):
                # Two consecutive noun morphemes form a compound-noun candidate.
                if tag[i][0] == 'N' and tag[i + 1][0] == 'N':
                    complex_nouns.append(lex[i] + ' ' + lex[i + 1])
                # Otherwise keep single nouns/adjectives longer than one character.
                elif tag[i] in ['NNG', 'NNP', 'NNB', 'NR', 'NP', 'VA'
                                ] and len(lex[i]) > 1:
                    complex_nouns.append(lex[i])

            return complex_nouns

        df['ner_khaiii_token'] = df['ner_clean'].apply(khaiii)

        if filename != '':
            df.to_csv(filename, index=False)
        return df
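A minimal usage sketch for the class above, assuming khaiii is installed with its resources built and the DataFrame has a 'ner_clean' column of raw Korean sentences (the column data and output path are illustrative):

import pandas as pd
from khaiii import KhaiiiApi

sample = pd.DataFrame({'ner_clean': ['유자 마카롱이 맛있어요', '초코 마카롱도 좋아요']})
extractor = khaiii_complexnoun(sample)
result = extractor.khaiii_complex()  # pass filename='out.csv' to also write a CSV
print(result['ner_khaiii_token'])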
def data_pre(message):
    global api
    global tokenizer
    message = message.replace(' ', '')
    message = spacing(message)
    api = KhaiiiApi()
    test_tokn = api.analyze(message)
    test_sentence = ""
    for eojeol in test_tokn:
        for morph in eojeol.morphs:
            test_sentence += str(morph) + " "
    test_sentence = "[CLS] " + test_sentence + " [SEP]"
    test_sentence = tokenizer.tokenize(test_sentence)
    while ('_' in test_sentence):
        test_sentence.remove('_')
    test_sentence_ids = tokenizer.convert_tokens_to_ids(test_sentence)
    test_sentence_ids = pad_sequences([test_sentence_ids],
                                      maxlen=MAX_LEN,
                                      dtype="long",
                                      truncating="post",
                                      padding="post")
    test_sentence_mask = [[float(i > 0) for i in test_sentence_ids[0]]]
    test_inputs = torch.tensor(test_sentence_ids)
    test_masks = torch.tensor(test_sentence_mask)
    return (test_inputs, test_masks)
Example #3
def find_contents(parser, search_word):
    api = KhaiiiApi('./khaiii/khaiii/build/lib/libkhaiii.0.4.dylib',
                    './khaiii/khaiii/build/share/khaiii')
    result = ""

    search_word_with_NLP = copy.deepcopy(search_word)

    for str_sword in search_word:
        for word in api.analyze(str_sword):
            for morph in word.morphs:
                if 'NN' in morph.tag:
                    search_word_with_NLP.append(morph.lex)

    search_word_with_NLP = tuple(search_word_with_NLP)

    for title_line in parser.contents:
        titles = tuple_extract(title_line)
        if set(titles) & set(search_word_with_NLP) == set(
                search_word_with_NLP):
            result += str(parser.contents[title_line]) + '\n'

    if result == "":
        return "Fail to Find"
    else:
        return result
Example #4
def get_noun(text):
    api = KhaiiiApi()

    analyzed = []
    for word in api.analyze(text):
        for pos in word.morphs:
            print("{0} - {1}".format(pos.lex, pos.tag))
            if pos.tag.startswith('NN') or pos.tag.startswith('SL'):
                analyzed.append(pos.lex)
    message = ''.join(analyzed)
    return str(message)
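A quick usage sketch (the sample sentence is illustrative):

# Prints each "lex - tag" pair and returns the NN*/SL morphemes joined without spaces.
nouns = get_noun('스벅 아메리카노 한잔 주세요')
print(nouns)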
def tokenize_khaiii(sentence):
    khaiii = KhaiiiApi()

    tagged = []
    for word in khaiii.analyze(sentence):
        mos = [(m.lex, m.tag) for m in word.morphs]
        tagged.extend(mos)

    result = [word for word, tag in tagged]

    return result
Example #6
def khaiii_tokenize(corpus_fname, output_fname):
    api = KhaiiiApi()

    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
            open(output_fname, 'w', encoding='utf-8') as f2:
        for line in f1:
            sentence = line.replace('\n', '').strip()
            tokens = api.analyze(sentence)
            tokenized_sent = ''
            for token in tokens:
                tokenized_sent += ' '.join([str(m)
                                            for m in token.morphs]) + ' '
            f2.write(tokenized_sent.strip() + '\n')
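A minimal usage sketch for khaiii_tokenize (the file names are illustrative):

# Each line of corpus.txt is rewritten as space-joined "lex/tag" morpheme tokens.
khaiii_tokenize('corpus.txt', 'corpus_tokenized.txt')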
Example #7
class Tokenizer:
    def __init__(self):
        self.api = KhaiiiApi()

    def tokenize(self, text):
        tokens = []
        for token in self.api.analyze(text):
            #print(token, type(token))
            tokens.append(token)
        return tuple(tokens)

    def check(self, text, words, check_func):
        if not check_func or not hasattr(check_func, '__call__'):
            return False
        nouns = self.get_nouns(text)
        if check_func([x for x in words if x in nouns]):
            return True
        return False

    def check_all(self, text, words):
        return self.check(text, words, check_func=all)

    def check_any(self, text, words):
        return self.check(text, words, check_func=any)

    def get_verbs(self, text, tags=['VV', 'XSV']):
        tokens = []
        for word in self.tokenize(text):
            for morph in word.morphs:
                if morph.tag in tags:
                    tokens.append(morph.lex)
        return tuple(tokens)

    def get_nouns(self, text, tags=['NNG', 'NNP']):
        tokens = []
        for word in self.tokenize(text):
            for morph in word.morphs:
                if morph.tag in tags:
                    tokens.append(morph.lex)
        return tuple(tokens)
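A minimal usage sketch for the Tokenizer above (the sample sentence is illustrative; the exact morphemes depend on the khaiii model):

tk = Tokenizer()
print(tk.get_nouns('아버지가 방에 들어가신다'))                        # e.g. ('아버지', '방')
print(tk.check_any('아버지가 방에 들어가신다', ['아버지', '어머니']))   # True if any listed word is among the nouns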
Example #8
class Khaiii_Tokenizer:
    def __init__(self):
        self.khaiii = KhaiiiApi()

    def tokenizer(self, sent):
        sent = sent.replace('\n', ' ')
        sent = sent.strip()
        words = []
        for word in self.khaiii.analyze(sent):
            words.extend([
                morph.lex for morph in word.morphs if morph.tag.startswith('N')
            ])
        words = [word for word in words if len(word) > 1]
        stopwords = list(
            set([
                '주룩', '아무', '이곳', '거기', '이분', '남자', '여자', '저곳', '때문', '녀석',
                '대답', '방법', '모습', '감정', '곳곳', '처음', '그녀', '자기', '가지', '본인',
                '만큼', '정말', '그때', '지금', '이름', '누구', '이때', '전날', '순간', '예전',
                '마찬가지', '오늘', '내일', '요즘', '우리', '과정', '사람', '인생', '생각', '최대한',
                '개월', '노릇', '그것', '저것', '이것', '요일', '결국', '이후', '이전', '다섯',
                '여섯', '일곱', '여덟', '아홉', '하나', '무엇', '동안', '정도', '기간'
            ]))
        words = [word for word in words if word not in stopwords]
        return words
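A minimal usage sketch (the review sentence is illustrative; only nouns longer than one character that are not stopwords are returned):

kt = Khaiii_Tokenizer()
print(kt.tokenizer('유자 마카롱에 상큼한 필링이 가득했어요'))  # e.g. ['유자', '마카롱', '필링']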
Example #9
def find_contents(contents_list, search_word):
    api = KhaiiiApi('./khaiii/khaiii/build/lib/libkhaiii.0.4.dylib',
                    './khaiii/khaiii/build/share/khaiii')
    result = ""

    search_word_with_NLP = copy.deepcopy(search_word)

    for str_sword in search_word:
        for word in api.analyze(str_sword):
            for morph in word.morphs:
                if 'NN' in morph.tag:
                    search_word_with_NLP.append(morph.lex)

    search_word_with_NLP = tuple(search_word_with_NLP)

    for col in contents_list:
        title_line = col["title"].replace(' ', '').split(',')
        if set(title_line) & set(search_word_with_NLP) != set():
            result += str(col["contents"])

    if result == "":
        return "Fail to Find"
    else:
        return result
Example #10
def title_to_token(train_data):
    api = KhaiiiApi()
    title_to_token = []
    vocab = []

    for idx in range(len(train_data)):
        token_lst = []
        try:
            title = train_data[idx]['plylst_title']
            for tok in api.analyze(title):
                for morph in tok.morphs:
                    if morph.tag in [
                            'NNG', 'NNP', 'NP', 'MAG', 'MAJ', 'JKS', 'VV',
                            'VA', 'IC', 'SN', 'SL'
                    ]:
                        vocab.append(morph.lex)
                        token_lst.append(morph.lex)
        except Exception:
            # khaiii may fail on empty or unusual titles; fall back to a placeholder token
            vocab.append('.')
            token_lst.append('.')

        title_to_token.append(token_lst)

    return title_to_token, vocab
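A minimal usage sketch (train_data is assumed to be a list of dicts with a 'plylst_title' key, as in the Melon playlist dataset; the sample titles are illustrative):

train_data = [{'plylst_title': '신나는 드라이브 노래'}, {'plylst_title': '잔잔한 새벽 감성'}]
tokens, vocab = title_to_token(train_data)
print(tokens)  # one token list per playlist title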
from khaiii import KhaiiiApi
api = KhaiiiApi()

for i in range(10):
    s = str(input())

    res = []
    for word in api.analyze(s):
        res.append(str(word))
    res = [word.split('\t')[1] for word in res]

    print(res)
start = time.time()

f = open('KCC150_K01_utf8.txt') # KCC150_K01_utf8

word = []	# morpheme list
sentence_morph_list = []	# per-sentence morphological analysis results and counts
sentence_cnt_of_morph_appear = [] 	# number of sentences each morpheme appears in

# progress output (%)
total_lines = 1000000
line_cnt = 0
percent = 0

# input sentence
input_morph = dict()
for morph_list in api.analyze(input_origin):
    for m in morph_list.morphs:
        if m.lex not in word:
            word.append(m.lex)
            sentence_cnt_of_morph_appear.append(0)
        morph_index = word.index(m.lex)
        input_morph[morph_index] = input_morph.get(morph_index, 0) + 1

# count each morpheme of the input sentence once toward its document frequency
for key, val in input_morph.items():
    sentence_cnt_of_morph_appear[key] += 1

# word embedding, morphological analysis
while(True):
    line_cnt += 1
    if( int(100 * line_cnt / total_lines) != percent):
        percent += 1
import pandas as pd

final = pd.DataFrame(final)
final.columns = ['#']

final.to_csv('/content/khaiii/rsc/src/preanal.manual', encoding='utf-8', index=False, header=False)

# Add a Khaiii user dictionary (pre-analysis entries)
!cd /content/khaiii/rsc
!mkdir -p /content/build/share/khaiii
!PYTHONPATH=/content/khaiii/src/main/python /content/khaiii/rsc/bin/compile_preanal.py --rsc-src=/content/khaiii/rsc/src --rsc-dir=/content/build/share/khaiii

from khaiii import KhaiiiApi
api = KhaiiiApi(rsc_dir="/content/build/share/khaiii")

for word in api.analyze('얼그레이가 맛있습니다.'):
  for morphs in word.morphs:
    print(morphs)

data = pd.read_csv('/content/final_preprocessed_data.csv', encoding='utf-8', index_col=0)

data.head()

data.info()

"""## Experiment 1) Khaiii에서 네이버 플레이스 리뷰 명사만 추출해 Topic Modeling"""

from khaiii import KhaiiiApi
api = KhaiiiApi(rsc_dir="/content/build/share/khaiii")

n_tags = ['NNG', 'NNP', 'NNB']#, 'VV', "VA", "XR"]
Example #14
sentence = u'내년도 최저임금을 기존 방식대로 전체 업종에 동일하게 적용하기로 결정했다.\
최저임금의 업종별 차등 적용을 요구해온 사용자위원들은 이에 반발해 전원회의에서 퇴장했다.\
최저임금위원회 사용자위원들은 이날 오후 정부세종청사에서 열린 최저임금위원회 제5차 전원회의 도중 퇴장해 기자들과 만나 \
"금일 최저임금위원회는 최저임금 고시에 월 환산액을 병기하고 2020년 최저임금을 모든 업종에 동일하게 적용하기로 결정했다"고 밝혔다.'
sentences = [sentence] * 10000

import time
from konlpy.tag import Hannanum, Kkma, Komoran, Okt, Mecab
from khaiii import KhaiiiApi
api = KhaiiiApi()
morphs_processors= [('Hannanum', Hannanum()), ('Kkma', Kkma()), ('Komoran', Komoran()), ('Okt', Okt()), ('mecab', Mecab())]
for name, morphs_processor in morphs_processors:
    start_time = time.time()
    morphs = [morphs_processor.pos(sentence) for sentence in sentences]
    elapsed_time = time.time() - start_time
    print('morphs_processor name = %20s, %.5f secs' % (name, elapsed_time))
start_time = time.time()
morphs = [api.analyze(sentence) for sentence in sentences]
elapsed_time = time.time() - start_time
print('morphs_processor name = %20s, %.5f secs' % ('khaiii', elapsed_time))
Example #15
class Tokenizer:
    def __init__(self):
        self._api = KhaiiiApi()
        # Stopword definitions
        self._stopwords = [
            '말', '곡', '때', '음악', '노래', 'a', 'an', 'the', 'in', 'on', 'at',
            'by', 'of'
        ]

        # Replacement terms
        self._alternative = [
            ('k-pop', 'kpop'),
            ('k팝', 'kpop'),
            ('j-pop', 'jpop'),
            ('r&b', 'rnb'),
            ('알앤비', 'rnb'),
            ('락', 'rock'),
            ('재즈', 'jazz'),
            ('째즈', 'jazz'),
            ('힙합', 'hiphop'),
            ('hip-hop', 'hiphop'),
            ('hip-hap', 'hiphop'),
            ('클래식', 'classic'),
            ('발라드', ' 발라드 '),
            ('라붐', 'laboum'),
            ('뉴에이지', 'newage'),
        ]

    def tokenize(self, sentence):
        clean_sentence = sentence.lower()

        # Apply the replacement terms
        for words in self._alternative:
            clean_sentence = re.sub(words[0], words[1], clean_sentence)

        # Remove special characters other than digits, lowercase English, and Hangul (including ㅋ)
        clean_sentence = re.sub('[^0-9a-z가-힣]', ' ', clean_sentence)

        morphs = []
        try:
            for word in self._api.analyze(clean_sentence):
                morphs.extend(self._word_tokenize(word))
        except Exception:
            morphs.clear()
            #print('[WARNING] Khaiii can not tokenize...({})'.format(sentence))

        # Remove stopwords
        keyword = {lex for lex, _ in morphs if not lex in self._stopwords}

        return list(keyword)

    def _word_tokenize(self, word):
        morphs = []

        prev_lex = ''
        prev_tag = ''

        for morph in word.morphs:
            # Store compound nouns as a single merged noun
            if morph.tag == 'NNG' and prev_tag == 'NNG':
                morphs.append((morphs.pop()[0] + morph.lex, morph.tag))
            elif morph.tag == 'NNG' and prev_tag == 'NNP':
                morphs.append((morphs.pop()[0] + morph.lex, morph.tag))
            elif morph.tag == 'NNP' and prev_tag == 'NNG':
                morphs.append((morphs.pop()[0] + morph.lex, morph.tag))
            elif morph.tag == 'NNP' and prev_tag == 'NNP':
                morphs.append((morphs.pop()[0] + morph.lex, morph.tag))

            elif morph.tag == 'NNG' and prev_tag == 'XR':
                morphs.append((morphs.pop()[0] + morph.lex, morph.tag))
            elif morph.tag == 'NNP' and prev_tag == 'XR':
                morphs.append((morphs.pop()[0] + morph.lex, morph.tag))
            elif morph.tag == 'XR' and prev_tag == 'NNG':
                morphs.append((morphs.pop()[0] + morph.lex, morph.tag))
            elif morph.tag == 'XR' and prev_tag == 'NNP':
                morphs.append((morphs.pop()[0] + morph.lex, morph.tag))

            elif morph.tag == 'NNG' and prev_tag == 'IC':
                morphs.append((prev_lex + morph.lex, morph.tag))
            elif morph.tag == 'NNP' and prev_tag == 'IC':
                morphs.append((prev_lex + morph.lex, morph.tag))

            # Common noun
            elif morph.tag == 'NNG':
                morphs.append((morph.lex, morph.tag))
            # Proper noun
            elif morph.tag == 'NNP':
                morphs.append((morph.lex, morph.tag))
            # Foreign word
            elif morph.tag == 'SL':
                morphs.append((morph.lex, morph.tag))
            # Root
            elif morph.tag == 'XR':
                morphs.append((morph.lex, morph.tag))

            # Verb: only if longer than one character
            elif morph.tag == 'VV' and len(morph.lex) > 1:
                morphs.append((morph.lex, morph.tag))
            # Adjective: only if longer than one character
            elif morph.tag == 'VA' and len(morph.lex) > 1:
                morphs.append((morph.lex, morph.tag))

            # Number: only if longer than one character
            elif morph.tag == 'SN' and len(morph.lex) > 1:
                morphs.append((morph.lex, morph.tag))
            # Number + bound noun (e.g. 2000년대)
            elif morph.tag == 'NNB' and prev_tag == 'SN':
                morphs.append((prev_lex + morph.lex, morph.tag))

            prev_lex = morph.lex
            prev_tag = morph.tag

        return morphs
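A minimal usage sketch for this playlist-title Tokenizer (assuming `re` and `KhaiiiApi` are imported; the sample title is illustrative and the exact keywords depend on the khaiii model):

tk = Tokenizer()
print(tk.tokenize('신나는 k-pop 드라이브 음악'))  # e.g. ['kpop', '드라이브', ...]; '음악' is removed as a stopword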
Example #16
from khaiii import KhaiiiApi
tokenizer = KhaiiiApi()

data = tokenizer.analyze("아버지가방에들어가신다")
tokens = []
for word in data:
    tokens.extend([str(m).split("/")[0] for m in word.morphs])
Example #17
class Tag_parser:
    def __init__(self, soup, url):
        self.tags = []
        self.titles = {}
        self.contents = {}
        self.stopwords = ['<!', 'script', 'function', '#']
        self.api = KhaiiiApi(
            '/home/hwang/khaiii/khaiii/build/lib/libkhaiii.so.0.4',
            '/home/hwang/khaiii/khaiii/build/share/khaiii')
        #self.tables=table_reader.get_all_tables(soup)
        #print(url,self.tables)
        self.table_count = 0
        '''
        self.url = 'http://hosp.ajoumc.or.kr/MedicalInfo/HospitalRoomGuide.aspx'
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        driver = webdriver.Chrome('./chrome/chromedriver_linux64/chromedriver', chrome_options=chrome_options)
        driver.implicitly_wait(3)
        driver.set_page_load_timeout(100)
        driver.get(self.url)
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        '''

        self.recursiveChildren(soup)

    def isstopWord(self, args):
        for word in self.stopwords:
            if word in args or '\n' == args or str(
                    type(args)
            ) == "<class 'bs4.element.Comment'>":  # ignore comment
                return True

        return False

    def imgTagparse(self, args):
        if 'alt' in args.attrs.keys():
            return args.attrs['alt']
        else:
            return ""

    def dictvalue_to_list(self, dicts):
        res_list = []
        for v in dicts.values():
            if v is None:
                v = []
            elif str(
                    type(v)) == "<class 'bs4.element.NavigableString'>" or str(
                        type(v)) == "<class 'str'>":
                v = list(v.strip().split())
            res_list.append(tuple(v))

        return tuple(res_list)

    def extract_words(self, text):
        temp = []
        if text == '': return temp

        for word in self.api.analyze(text):
            for morph in word.morphs:
                if 'NN' in morph.tag:
                    temp.append(morph.lex)

        if len(temp):
            return temp

    def parents_name(self, link, tag):
        name = []
        if (type(link) == type('')): return False
        for p in link.parents:
            name.append(p.name)

        if (set(tag) & set(name)): return True
        else: return False

    def recursiveChildren(self, x):
        try:
            for child in x.recursiveChildGenerator():
                if self.isstopWord(child):
                    continue
                name = getattr(child, "name", None)

                if name == 'img':
                    child = self.imgTagparse(child)
                    self.tags.append(name)
                    name = None

                if name is not None:

                    if 'li' == name:
                        ##### insertion to contents dict code
                        self.titles['word_from_contents'] = self.extract_words(
                            child.get_text().strip())
                        self.contents[self.dictvalue_to_list(
                            self.titles)] = [child.get_text().strip()]
                        ######
                    # elif 'table' == name:
                    #     ############  write code here . ########################
                    #     temp_table=self.tables[self.table_count]
                    #     self.table_count+=1
                    #     if not temp_table:
                    #         continue
                    #     for line in temp_table:
                    #         try:
                    #             self.titles['word_from_contents'] = self.extract_words(line)
                    #             self.contents[self.dictvalue_to_list(self.titles)] = [line]
                    #             #print(self.titles,' : ', self.contents[self.dictvalue_to_list(self.titles)])
                    #         except:
                    #             pass
                    #     pass
                    else:
                        self.tags.append(name)
                else:
                    if child.isspace() or len(
                            self.tags
                    ) == 0 or child == '':  # leaf node, don't print spaces or non-tag
                        continue
                    else:
                        if self.parents_name(child, ['li', 'table']):
                            continue

                        if 'h' in self.tags[-1] or 'img' in self.tags[
                                -1]:  # or 'span' in self.tags[-1]:  # append headline
                            if 'img' in self.tags[-1] and 'h' in self.tags[
                                    -2]:  # img tag in headline
                                self.titles[self.tags[-2]] = child
                            elif 'h' in self.tags[-1]:  # just headline
                                self.titles[self.tags[-1]] = child
                        else:
                            self.titles[
                                'word_from_contents'] = self.extract_words(
                                    child)
                            self.contents[self.dictvalue_to_list(
                                self.titles)] = [
                                    child.strip()
                                ]  # set contents {title : contents}
                            print(
                                self.titles, ' : ',
                                self.contents[self.dictvalue_to_list(
                                    self.titles)])

                    if len(self.tags):
                        self.tags.pop(-1)

        except Exception as ex:
            print("error ", ex)
            return
Example #18
class NewsProcessor:
    def __init__(self,
                 news_path='/home/ir1067/FOR_TITLE/Title_33_2014_all',
                 fin_path='/home/ir1067/price_w_indicator',
                 kospi_data_path="/home/ir1067/data/kospi.csv"):
        # path
        self.fin_path = fin_path

        # .xlsx file -> news text data
        self.xlsx_list = [
            name for name in os.listdir(news_path)
            if ('.xlsx' in name) & ('#' not in name)
        ]
        self.xlsx_list.sort()

        # company names
        self.company_list = set([file[:-5] for file in self.xlsx_list])

        # .csv file -> financial data [open, close, price]
        self.csv_list = [
            name for name in os.listdir(fin_path)
            if ('.csv' in name) & ('#' not in name)
        ]
        self.csv_list.sort()

        # kospi market data [open, close]
        self.kospi = pd.read_csv(kospi_data_path).set_index('date')
        self.kospi.index = pd.to_datetime(self.kospi.index)
        self.kospi = self.kospi[['open', 'close']]
        self.kospi['open'] = [
            re.sub(',', '', text) for text in self.kospi['open']
        ]
        self.kospi['close'] = [
            re.sub(',', '', text) for text in self.kospi['close']
        ]
        self.kospi.open = self.kospi.open.astype(float)
        self.kospi.close = self.kospi.close.astype(float)
        # Khaiii API
        self.khaiii = KhaiiiApi()

        # print info
        print("Data file infomation")
        print("- News data (xlsx):\t{}".format(len(self.xlsx_list)))
        print("- Price data (csv):\t{}".format(len(self.csv_list)))
        print("- Company count:\t{}".format(len(self.company_list)))
        print("NewsProcessor init complete.")

    def get_xlsx(self, company_name):
        output = pd.DataFrame()

        # xlsx files containing company name
        data_list = [
            filename for filename in self.xlsx_list if company_name in filename
        ]

        for filename in data_list:
            news = pd.read_excel(filename, index_col=0)
            output = pd.concat([output, news])
        output.reset_index(inplace=True, drop=True)

        print("Data NaN infomation")
        for col in output.columns:
            output[col] = [
                text if text != "" else np.nan for text in output[col]
            ]
        print(output.isna().sum())

        output['date'] = pd.to_datetime(output['date'], format="%Y.%m.%d")
        output.set_index('date', drop=True, inplace=True)

        return output

    def get_csv(self, company_name):
        output = pd.DataFrame()

        # csv files containing company name
        data_list = [
            filename for filename in self.csv_list if company_name in filename
        ]

        for filename in data_list:
            price = pd.read_csv(self.fin_path + filename, index_col=0)
            output = pd.concat([output, price])

        output.index = pd.to_datetime(output.index, format="%Y.%m.%d")

        return output

    def clean_text(self, data):
        # Consider also removing ⓒ~, copyright-holder notices, and reporter bylines

        data['title'] = [re.sub(r'\[.+?\]', '', text, 0, re.I | re.S).strip() \
                                for text in data['title']]
        data['title'] = [
            text if text != '' else np.nan for text in data['title']
        ]

        data['contents'] = [text.replace("// flash 오류를 우회하기 위한 함수 추가\nfunction _flash_removeCallback() {}", "") \
                            for text in data['contents']]
        data['contents'] = [re.sub(r'\(.+?\)', '', text, 0, re.I | re.S).strip() \
                            for text in data['contents']]
        data['contents'] = [re.sub('{.+?}', '', text, 0, re.I | re.S).strip() \
                            for text in data['contents']]
        data['contents'] = [re.sub(r'\[.+?\]', '', text, 0, re.I | re.S).strip() \
                            for text in data['contents']]
        data['contents'] = [re.sub('<.+?>', '', text, 0, re.I | re.S).strip() \
                            for text in data['contents']]
        data['contents'] = [re.sub('<.+?>', '', text, 0, re.I | re.S).strip() \
                            for text in data['contents']]
        # ▶ one article starts with this character
        #data['contents'] = [re.sub('▶.*', '', text, 0, re.I | re.S).strip().replace(",", "") \
        #                     for text in data['contents']]

        print("Check NaN")
        print(data.isna().sum())

    def drop_empty(self, data):
        print("Data length before drop: ", len(data))
        data.dropna(inplace=True, how='any')
        data.drop(data[data['contents'] == ''].index, inplace=True)
        data.reset_index(drop=True, inplace=True)
        #index = [news.index for news in data['contents'] if news == '']
        #for i in index:
        #   data.drop([data.index[i]], inplace= True)
        print("Data length after drop:  ", len(data))

    def tokenizing(self, data, tag):
        if type(tag) == list:
            try:
                print("Start Khaiii analyze")
                after_analyze = [
                    self.khaiii.analyze(news) for news in data['contents']
                    if news != ''
                ]
                print("Done")

                tokenized = [[morph.lex for chunk in news for morph in chunk.morphs \
                       if morph.tag in tag] for news in after_analyze]
                tokenized = [
                    text if text != [] else np.nan for text in tokenized
                ]

                is_empty = [1 if text is np.nan else 0 for text in tokenized]
                print("Empty list after tokenizing: {}".format(sum(is_empty)))

                return tokenized

            except KeyError as e:
                print(e, "DataFrame does not have 'contents' column")
        else:
            print("Error: parameter 'tag' must be list")

    def tokenizing_title(self, data, tag):
        if type(tag) == list:
            try:
                print("Start Khaiii analyze")
                after_analyze = [
                    self.khaiii.analyze(news) for news in data['title']
                    if news != ''
                ]
                print("Done")

                tokenized_title = [[morph.lex for chunk in news for morph in chunk.morphs \
                       if morph.tag in tag] for news in after_analyze]
                tokenized_title = [
                    text if text != [] else np.nan for text in tokenized_title
                ]

                is_empty = [
                    1 if text is np.nan else 0 for text in tokenized_title
                ]
                print("Empty list after tokenizing: {}".format(sum(is_empty)))

                return tokenized_title

            except KeyError as e:
                print(e, "DataFrame does not have 'contents' column")
        else:
            print("Error: parameter 'tag' must be list")

    def labeling(self, data, kospi, days):
        data.index = pd.to_datetime(data.index)

        kospi.columns = ['k_open', 'k_close']
        data = pd.merge(data,
                        kospi,
                        how='right',
                        left_index=True,
                        right_index=True)
        data = data.dropna(how='any')

        for day in days:
            if day == 1:
                open = data['open']
                close = data['close']
                rtn = close / open - 1
                mkt_rtn = data['k_close'] / data['k_open'] - 1
                data['label'] = (rtn > mkt_rtn).astype(int).shift(-1)
            else:
                price = data['adj_close']
                rtn = price.pct_change(day).shift(-day - 1)
                mkt_rtn = data['k_close'].pct_change(day).shift(-day - 1)
                data['label%d' % day] = (rtn > mkt_rtn).astype(int)

        data.drop(['k_open', 'k_close'], inplace=True, axis=1)
        indicators = data[data.columns[:]]

        return indicators

    def to_datetime(self, unified_file):
        for i in range(len(unified_file.date)):
            if i % 100 == 0:
                print('processing', i)
            if len(unified_file.date[i]) == 19:
                unified_file.date[i] = unified_file.date[
                    i][:15] + '0' + unified_file.date[i][15:]

            if bool(re.search('오후', unified_file.date[i])) & bool(
                    unified_file.date[i][15:17] != '12') == True:
                unified_file.date[i] = unified_file.date[i][:15] + '{}'.format(int(unified_file.date[i][15:17]) + 12) + \
                                        unified_file.date[i][17:]

            unified_file.date[
                i] = unified_file.date[i][:12] + unified_file.date[i][15:]

        unified_file.date = pd.to_datetime(unified_file.date)
        unified_file.set_index(['date'], inplace=True)
Example #19
     pre_sent = pre_sent.replace(match, '')
 matches = re.findall(p2, pre_sent)
 for match in matches:
     pre_sent = pre_sent.replace(match, '')
 matches = re.findall(p, post_sent)
 for match in matches:
     post_sent = post_sent.replace(match, '')
 matches = re.findall(p2, post_sent)
 for match in matches:
     post_sent = post_sent.replace(match, '')
 output_sent = ''
 pre_sent = pre_sent.strip()
 post_sent = post_sent.strip()
 if len(pre_sent) < 1 or len(post_sent) < 1:
     continue
 pre_words = api.analyze(pre_sent)
 post_words = api.analyze(post_sent)
 pre_sent = ''
 for word in pre_words:
     for morph in word.morphs:
         if morph.tag == 'SL':
             continue
         text = morph.lex
         pre_sent += text
         pre_sent += ' '
 pre_sent = pre_sent.strip()
 if len(pre_sent) < 1:
     continue
 post_sent = ''
 for word in post_words:
     for morph in word.morphs:
Example #20
#x = io.open('2016-10-20.index_new',mode='r', encoding='utf-8')
#nt = io.open('top5_txt',mode='w')

fin = ["EF", "SF", "EC", "ㅋ"]
label = ["NNG", "W", "MAG", "VA", "NNP"]
cnt = 0
while True:
    text = t.readline()
    total = ""
    if text == "\n":
        print()
        continue
    if "FINISH" in text:
        break
    print(cnt)
    for word in api.analyze(text):
        tmp = str(word)
        print(tmp)
        if any(format in tmp for format in label):
            chk = 0
            if any(format in tmp for format in fin):
                chk = 1
            ttmp = tmp.split()
            if chk == 1:
                ttmp[0] = ttmp[0] + "."
            total = total + " " + ttmp[0]
    print()
    print()
    cnt += 1
#    print(total)
Example #21
def khaiii():
    api = KhaiiiApi()
    words = __name__
    for word in api.analyze('안녕, 세상.'):
        words += ", " + word
    return words
def augment_data(args):

    # Option checking
    if args.no_analyzer:
        args.p_pos = 0. # disable replacement using POS tags.

    # Load original tsv file
    input_tsv = load_tsv(args.input, skip_header=False)

    if args.no_analyzer:
        sentences = []
        for text, label in tqdm(input_tsv, desc='No POS tagging'):
            sentence = []
            for token in text.split():
                tag = 'word'
                word = Word(token, tag)
                sentence.append(word)
            sentences.append((sentence, label))
    else:
        # POS tagging
        if args.analyzer == 'spacy':
            import spacy
            from spacy.symbols import ORTH
            spacy_en = spacy.load('en_core_web_sm')
            spacy_en.tokenizer.add_special_case(args.mask_token, [{ORTH: args.mask_token}])
            sentences = [(spacy_en(text), label) for text, label in tqdm(input_tsv, desc='POS tagging')]
        if args.analyzer == 'khaiii':
            from khaiii import KhaiiiApi
            khaiii_api = KhaiiiApi()
            sentences = []
            for text, label in tqdm(input_tsv, desc='POS tagging'):
                sentence = []
                khaiii_sentence = khaiii_api.analyze(text)
                for khaiii_word in khaiii_sentence:
                    for khaiii_morph in khaiii_word.morphs:
                        morph = khaiii_morph.lex
                        tag = khaiii_morph.tag
                        # we might need to modify 'morph' for matching the vocab of GloVe.
                        # ex) if tag in ['VV', 'VA', 'VX', 'XSV', 'XSA', 'VCP']: morph += u'다'
                        word = Word(morph, tag)
                        sentence.append(word)
                sentences.append((sentence, label))
        if args.analyzer == 'npc':
            sys.path.append('data/clova_sentiments_morph/npc-install/lib')
            import libpnpc as pnpc
            res_path = 'data/clova_sentiments_morph/npc-install/res'
            npc = pnpc.Index()
            npc.init(res_path)
            sentences = []
            for text, label in tqdm(input_tsv, desc='POS tagging'):
                sentence = []
                npc_sentence = npc.analyze(text)
                for item in npc_sentence:
                    meta = item['meta']
                    if meta != '[NOR]': continue
                    morph = item['morph']
                    tag = item['mtag']
                    word = Word(morph, tag)
                    sentence.append(word)
                sentences.append((sentence, label))

    if args.no_augment:
        # Write to file
        with open(args.output, 'w') as f:
            for sentence, label in tqdm(sentences, desc='Writing'):
                s = [] 
                for word in sentence:
                    s.append(word.text)
                if args.preserve_label: out_label = label
                else: out_label = args.dummy_label
                f.write("{}\t{}\n".format(' '.join(s), out_label))
        sys.exit(0)

    # Build lists of words indexes by POS
    pos_dict = {} if args.no_analyzer else build_pos_dict(sentences, lower=args.lower)

    # Generate augmented samples
    if args.parallel:
        pool = mp.Pool(mp.cpu_count())
        # processs in parallel
        entries = []
        for sentence, label in tqdm(sentences, desc='Preparation data for multiprocessing'):
            entry = {'sentence': sentence,
                     'label': label,
                     'pos_dict': pos_dict,
                     'args': args}
            entries.append(entry)
        print('Data ready! go parallel!') 
        sentences = pool.map(make_samples, entries, chunksize=100)
        sentences = reduce(lambda x,y: x+y, sentences)
        pool.close()
        pool.join()
        print('Done!')
    else:
        # process sequentially
        augmented = []
        for sentence, label in tqdm(sentences, desc='Sampling'):
            entry = {'sentence': sentence,
                     'label': label,
                     'pos_dict': pos_dict,
                     'args': args}
            samples = make_samples(entry) 
            augmented.extend(samples)
        sentences = augmented

    # Write to file
    with open(args.output, 'w') as f:
        for sentence, label in tqdm(sentences, desc='Writing'):
            if args.preserve_label: out_label = label
            else: out_label = args.dummy_label
            f.write("{}\t{}\n".format(' '.join(sentence), out_label))
Example #23
# -*- coding: utf-8 -*-

from khaiii import KhaiiiApi
api = KhaiiiApi()

result = []
for word in api.analyze("스벅 아메리카노 한잔 주세요"):
    aWord = dict()

    word_list = []
    for m in word.morphs:
        morph = dict()
        morph['lex'] = m.lex
        morph['tag'] = m.tag
        word_list.append(morph)
    aWord['list'] = word_list
    aWord['word'] = word.lex
    # attrs = vars(word)
    # for item in attrs.items():
    result.append(aWord)

print(result)
class TitleMethod(Method):
    """ Title KNN Method class for playlist continuation task cold start problem.
    
    Title KNN Method.

    Attributes:
        name (str)  : name of method
        playlist2idx (dict) : playlist to index dictionary.
        title2playlist (dict)   : title to list of playlists dictionary.
        token2idx (dict)    : NLP processed token to index dictionary.
        token2title (dict)  : NLP processed token to list of titles dictionary. 
        doc2vec_model (doc2vec) : Doc2Vec Model in gensim.
        tt_matrix (sparse matrix)   : NLP processed token to tag matrix
        ts_matrix (sparse matrix)   : NLP processed token to song matrix
        api (KhaiiiApi) : Korean tokenizer
    Return:
    """
    def __init__(self, name):
        super().__init__(name)

        self.playlist2idx = dict()
        self.title2playlist = dict()
        self.token2idx = dict()
        self.token2title = dict()

        self.unique_token = set()

        self.doc2vec_model = None

        self.tt_matrix = None
        self.ts_matrix = None

        self.api = KhaiiiApi()

    def _tokenize_title(self, title):
        """ Tokenize playlist title.
        
        Tokenize playlist title using khaiii.

        Attributes:
            title (str) : playlist title
        Return:
            token (list)   : list of "lexicon/tag" token
        """

        token = list()
        try:
            words = self.api.analyze(title)
        except KhaiiiExcept:
            words = list()

        for word in words:
            for morph in word.morphs:
                if morph.tag[:2] in ['NN', 'VV', 'VA', 'VC', 'MM', 'XR'
                                     ] or morph.tag == 'MAG':
                    token.append('/'.join([morph.lex, morph.tag]))

        return token

    def _prepare_data(self):
        """ Prepare necessary data structures for Title KNN Method.

        Prepare necessary data structures for Title KNN Method.

        """

        ### tokenize using khaiii
        ### make csr matrix (token - tag | song)
        row = {'tag': list(), 'song': list()}
        col = {'tag': list(), 'song': list()}
        data = {'tag': list(), 'song': list()}

        token_id = 0
        for title, playlist in self.title2playlist.items():

            # check whether this title is in the train dataset (not validation or test dataset)
            has_train_playlist = False
            for p in playlist:
                playlist_id = self.playlist2idx[p]
                if playlist_id < self.n_train:
                    has_train_playlist = True
                    break

            if not has_train_playlist:
                continue

            token = self._tokenize_title(title)

            for t in token:
                if t in self.token2idx:
                    token_id = self.token2idx[t]
                else:
                    self.token2idx[t] = token_id

                for p in playlist:
                    playlist_id = self.playlist2idx[p]
                    if playlist_id < self.n_train:
                        for item_id in self.pt_train[playlist_id].nonzero()[1]:
                            row['tag'].append(token_id)
                            col['tag'].append(item_id)
                            data['tag'].append(1)

                        for item_id in self.ps_train[playlist_id].nonzero()[1]:
                            row['song'].append(token_id)
                            col['song'].append(item_id)
                            data['song'].append(1)

                token_id = len(self.token2idx)

        self.tt_matrix = csr_matrix((data['tag'], (row['tag'], col['tag'])),
                                    dtype=float)
        self.ts_matrix = csr_matrix((data['song'], (row['song'], col['song'])),
                                    dtype=float)

        _, self.tt_matrix = transform_idf(self.tt_matrix)
        _, self.ts_matrix = transform_idf(self.ts_matrix)

    def _rate(self, pid, mode):
        """ Make ratings.
        
        Rate on items(tag/song) based on test data, which index is pid.
        
        Args:
            pid (int)   : playlist id in test data
            mode (str)  : determine which item. tags or songs
        Return:
            rating(numpy array): playlist and [tags or songs] rating 
        """

        assert mode in ['tags', 'songs']

        title_matrix = self.tt_matrix if mode == 'tags' else self.ts_matrix
        n = self.n_tag if mode == 'tags' else self.n_song

        idx2playlist = {
            idx: playlist
            for playlist, idx in self.playlist2idx.items()
        }
        playlist2title = dict()
        for title, playlists in self.title2playlist.items():
            for playlist in playlists:
                playlist2title[playlist] = title

        rating = np.zeros(n)

        playlist = idx2playlist[pid + self.n_train]
        title = playlist2title[playlist]
        token = self._tokenize_title(title)
        token = [t for t in token if t in self.token2idx.keys()]
        token_ids = [self.token2idx[t] for t in token]

        if len(token_ids) == 0:
            return rating

        rating = np.sum(title_matrix[token_ids, :].toarray(),
                        axis=0).reshape(-1)
        return rating

    def initialize(self,
                   n_train,
                   n_test,
                   pt_train,
                   ps_train,
                   pt_test,
                   ps_test,
                   transformer_tag,
                   transformer_song,
                   checkpoint_dir='./checkpoints'):
        """ initialize necessary variables for Method.

        initialize necessary data structure.

        Args: 
            n_train (int)   : number of playlist in train dataset.
            n_test (int)    : number of playlist in test dataset. 
            pt_train (csr_matrix)   : playlist to tag sparse matrix made from train dataset.
            ps_train (csr_matrix)   : playlist to song sparse matrix made from train dataset.
            pt_test (csr_matrix)    : playlist to tag sparse matrix made from test dataset.
            ps_test (csr_matrix)    : playlist to song sparse matrix made from test dataset.
            transformer_tag (TfidfTransformer)  : scikit-learn TfidfTransformer model fitting pt_train.
            transformer_song (TfidfTransformer) : scikit-learn TfidfTransformer model fitting ps_train.
            checkpoint_dir (str)    : where to save similarity matrix.
        Return:
        """

        super().initialize(n_train, n_test, pt_train, ps_train, pt_test,
                           ps_test, transformer_tag, transformer_song)

        ### tokenize using khaiii
        ### make csr matrix (token - tag | song)
        row = {'tag': list(), 'song': list()}
        col = {'tag': list(), 'song': list()}
        data = {'tag': list(), 'song': list()}

        token_id = 0
        for title, playlist in self.title2playlist.items():

            # check whether this title is in the train dataset (not validation or test dataset)
            has_train_playlist = False
            for p in playlist:
                playlist_id = self.playlist2idx[p]
                if playlist_id < self.n_train:
                    has_train_playlist = True
                    break

            if not has_train_playlist:
                continue

            token = self._tokenize_title(title)

            for t in token:
                if t in self.token2idx:
                    token_id = self.token2idx[t]
                else:
                    self.token2idx[t] = token_id

                for p in playlist:
                    playlist_id = self.playlist2idx[p]
                    if playlist_id < self.n_train:
                        for item_id in self.pt_train[playlist_id].nonzero()[1]:
                            row['tag'].append(token_id)
                            col['tag'].append(item_id)
                            data['tag'].append(1)

                        for item_id in self.ps_train[playlist_id].nonzero()[1]:
                            row['song'].append(token_id)
                            col['song'].append(item_id)
                            data['song'].append(1)

                token_id = len(self.token2idx)

        self.tt_matrix = csr_matrix((data['tag'], (row['tag'], col['tag'])),
                                    dtype=float)
        self.ts_matrix = csr_matrix((data['song'], (row['song'], col['song'])),
                                    dtype=float)

        _, self.tt_matrix = transform_idf(self.tt_matrix)
        _, self.ts_matrix = transform_idf(self.ts_matrix)

    def predict(self, pid, mode):
        """ Make ratings based on mode.

        rate the playlist, which index in test sparse matrix is pid based on mode.

        Args: 
            pid (int)   : playlist id in test sparse matrix
            mode (str)  : tags or songs
        Return:
            rating (ndarray)    : playlist id and rating
        """
        rating = self._rate(pid, mode=mode)
        return rating
text = args.text

print("-"*5,"원본 텍스트", "-"*5)
print(text)

print("-"*5, "Mecab", "-"*5)
print(mecab.morphs(text))

print("-"*5, "Okt", "-"*5)
print(okt.morphs(text))

print("-"*5, "Komoran", "-"*5)
print(komoran.morphs(text))

print("-"*5, "Hannanum", "-"*5)
print(hannanum.morphs(text))

print("-"*5, "Kkma", "-"*5)
print(kkma.morphs(text))

print("-"*5, "Khaiii", "-"*5)
tokens = []
for word in khaiii.analyze(text):
    tokens.extend([str(m).split('/')[0] for m in word.morphs])
print(tokens)

print("-"*5, "bert-base-multilingual-cased", "-"*5)
print(tokenizer.tokenize(text))



# In[9]:


api = KhaiiiApi()


# In[10]:


train_pos = []
for row in train_data:
    sent_pos = []
    sentence = row[1]
    for word in api.analyze(sentence):
        pos = str(word).split('\t')[1]
        sent_pos.append(pos)
    train_pos.append(sent_pos)
train_pos[:5]


# In[11]:


test_pos = []
for row in test_data:
    sent_pos = []
    sentence = row[1]
    for word in api.analyze(sentence):
        pos = str(word).split('\t')[1]
Example #27
          for morphs in word.morphs:
            if morphs.tag in n_tags:
              if len(morphs.lex) > 1:
                nouns.append(morphs.lex)
              else:
                continue

        extract_corpus.append(nouns)

    return extract_corpus

data.Review[0]

n_tags = ['NNG', 'NNP', 'NNB']
api = KhaiiiApi(rsc_dir="/content/build/share/khaiii")
for word in api.analyze("유자 마카롱에 상큼한 필링이 가득했고, 초코 마카롱에 초코칩이 잔뜩 박혀 식감이 좋았어요"):
    for morph in word.morphs:
      if morph.tag in n_tags:
        print(morph)

# Extract nouns and roots from Naver receipt reviews and Instagram reviews
khaiii_xr = extract_corpus_khaiii(data['Review']) 

#khaiii_all = extract_corpus_khaiii(data['Review']) # extract nouns, verbs, adjectives, and roots

# Compare with the data before extraction

print(len(data), len(khaiii_xr))
#print(len(data), len(khaiii_all))

import gensim
Example #28
        if line == "\n":
            continue

        # Temporary workaround for a preprocessing mistake; safe to remove later.
        line = re.sub("\xa0", " ", line).strip()
        if line == "":
            continue

        # 나중에 고도화 및 모듈화 해야할 곳
        ##############################################################
        #형태소 분석 전 전처리 작업 (형태소 분석에 악영향을 주는 기호 삭제)
        line = line.replace("'", "").replace('"', "").replace("‘", "") \
            .replace("’", "").replace("“", "").replace('”', "").replace \
            ('·', " ")
        words = khaiii.analyze(line)
        for word in words:
            # print(word)
            strList.append("{0}".format(word))

# To be refined and modularized later
#############################################################################
# unit (meaningful word-phrase segment; self-defined)
# Create unitList to hold the units and start unit parsing
unitList = []
for i, line in enumerate(strList):
    morpParcelList = line.split("\t")[1].replace('"',
                                                 '').replace("'",
                                                             "").split(" + ")

    morpMetaList = []
Example #29
from khaiii import KhaiiiApi
api = KhaiiiApi()
f = open('./input3.txt','r')
time = f.readline()
press = f.readline()
print(time,press)
total = []
cnt=0
while True:
    line = f.readline()
    if not line: break
    for word in api.analyze(line):
        words = str(word).split()
        for tmp in words:
            if 'NNG' in tmp:
                cnt=cnt+1
                ttmp = tmp.split('/')
                total.append(ttmp[0])
print(total)

Example #30
        extract_corpus.append(nouns)

    return extract_corpus

api = KhaiiiApi()

n_tags = ['NNG', 'NNP', 'NNB', 'VV', "VA" ]


extract_corpus = []
for line in data.FReview[0]:
  if str(line) != 'nan':
    nouns = []

    for word in api.analyze(str(line)):
      for morphs in word.morphs:
        if morphs.tag in n_tags:
          nouns.append(morphs.lex)

    extract_corpus.append(nouns)

data.FReview[1]

extract_corpus_khaiii = extract_corpus_khaiii(data['FReview'][1])

print(len(extract_corpus_mecab), len(extract_corpus_khaiii))

data.Review[0]

extract_corpus_mecab[:10]