Example #1
"""## Mecab"""

!git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git

cd Mecab-ko-for-Google-Colab

ls

!bash install_mecab-ko_on_colab190912.sh

from konlpy.tag import Mecab

mecab = Mecab()
print(mecab.morphs('노트북 사고 싶다'))

"""## 데이터 로드"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import urllib.request
from collections import Counter
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

urllib.request.urlretrieve("https://raw.githubusercontent.com/bab2min/corpus/master/sentiment/naver_shopping.txt", filename="ratings_total.txt")

total_data = pd.read_table('ratings_total.txt', names=['ratings', 'reviews'])
Example #2
from konlpy.tag import Mecab, Okt, Kkma, Twitter, Komoran

# Mecab: currently the fastest ( > Twitter)
# Okt: renamed from Twitter
# Kkma: extracts accurate POS information
# Komoran: when both accuracy and speed matter
text = "한글 자연어 처리는 재밌다 이제부터 열심히 해야지 ㅎㅎㅎ"
mecab = Mecab()
mecab.morphs(text)

kkma = Kkma()
kkma.morphs(text)

komoran = Komoran()
komoran.morphs(text)

okt = Okt()
okt.morphs(text, stem=True)
okt.pos(text, stem=True)  # POS tagging with stemming
okt.pos(text, join=True)  # POS tags joined onto each token in the returned list




Example #3
def predict(text):
    top_k = 5
    saved_data = torch.load(
        'kcbert_novalid_9.pth', map_location='cpu')
        # map_location='cpu' if config.gpu_id < 0 else 'cuda:%d' % config.gpu_id

    train_config = saved_data['config']
    bert_best = saved_data['bert']
    index_to_label = saved_data['classes']

    lines = read_text(text)
    mecab = Mecab()
    mc_lines = mecab.morphs(lines[0])
    texts = ' '.join(mc_lines)
    print(texts)
    

    with torch.no_grad():
        # Declare model and load pre-trained weights.
        tokenizer = AutoTokenizer.from_pretrained(train_config.pretrained_model_name)
        model = BertForSequenceClassification.from_pretrained(
            train_config.pretrained_model_name,
            num_labels=len(index_to_label)
        )
        model.load_state_dict(bert_best)

        if gpu_id >= 0:
            model.cuda(gpu_id)
        device = next(model.parameters()).device

        # Don't forget to turn on evaluation mode.
        model.eval()

        mini_batch = tokenizer(
            [texts],
            padding=True,
            truncation=True,
            return_tensors="pt",
        )

        x = mini_batch['input_ids']
        x = x.to(device)
        mask = mini_batch['attention_mask']
        mask = mask.to(device)

        # Run the feed-forward pass.
        y_hat = model(x, attention_mask=mask)[0]
        # print(y_hat)
        probs = nn.Softmax(dim=1)
        softmax_output = probs(y_hat)
        print(softmax_output)

        probs, indice = softmax_output.cpu().topk(top_k)
        # |indice| = (len(lines), top_k)
        print(indice)
        print(probs)

        results = [index_to_label[int(indice[0][j])] for j in range(top_k)]
        probs = [round(probs[0][j].item(),4) for j in range(top_k)]
        res = list(zip(results,probs))

    return res
Example #4
class WordTokenizer(Tokenizer):
    """
    Word Tokenizer

    * Args:
        name: tokenizer name [treebank_en|spacy_en|mecab_ko|bert_basic]

    * Kwargs:
        flatten: if True, return a single flattened list of tokens
        split_with_regex: post-split step that further splits tokens the tokenizer cannot split
    """
    def __init__(self, name, sent_tokenizer, config={}, split_with_regex=True):
        super(WordTokenizer,
              self).__init__(name, f"word-{name}+{sent_tokenizer.cache_name}")
        self.config = config
        self.sent_tokenizer = sent_tokenizer
        self.word_tokenizer = None

        self.split_with_regex = split_with_regex
        if split_with_regex:
            self.extra_split_chars_re = self.make_split_regex_expression()

    def make_split_regex_expression(self):
        """
        Apply a small amount of extra splitting to the given tokens, in particular to avoid UNK tokens
        due to contractions, quotations, or other forms of punctuation. I haven't really run tests to see
        if/how much difference this makes, but it does avoid some common UNKs I noticed in SQuAD/TriviaQA
        """
        extra_split_chars = (
            "-",
            "£",
            "€",
            "¥",
            "¢",
            "₹",
            "*",
            "\u2212",
            "\u2014",
            "\u2013",
            "/",
            "~",
            '"',
            "'",
            "\ud01C",
            "\u2019",
            "\u201D",
            "\u2018",
            "\u00B0",
            ".",
            ":",
        )
        extra_split_tokens = (
            "``",
            "(?<=[^_])_(?=[^_])",  # dashes w/o a preceeding or following dash, so __wow___ -> ___ wow ___
            "''",
            "[" + "".join(extra_split_chars) + "]",
        )
        return re.compile("(" + "|".join(extra_split_tokens) + ")")

    @overrides
    def _tokenize(self, text, unit="text"):
        """ Text -> word tokens """
        if type(text) != str:
            raise ValueError(f"text type is must be str. not {type(text)}")

        if unit == "sentence":
            tokens = getattr(self, f"_{self.name}")(text)
        else:
            sentences = self.sent_tokenizer.tokenize(text)
            tokens = [
                getattr(self, f"_{self.name}")(sentence)
                for sentence in sentences
            ]

        if self.split_with_regex and self.name != "spacy_en":
            tokens = self._split_with_regex(tokens)

        return list(common_utils.flatten(tokens))

    def _split_with_regex(self, sentences):
        for i, sentence in enumerate(sentences):
            sentences[i] = [
                token for token in self._post_split_tokens(sentence)
            ]
        return sentences

    def _post_split_tokens(self, tokens):
        return [[x for x in self.extra_split_chars_re.split(token) if x != ""]
                for token in tokens]

    """ Tokenizers """

    def _space_all(self, text):
        def is_whitespace(c):
            if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(
                    c) == 0x202F:
                return True
            return False

        prev_is_whitespace = True
        tokens = []
        for char in text:
            if is_whitespace(char):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    tokens.append(char)
                else:
                    tokens[-1] += char
                prev_is_whitespace = False
        return tokens

    def _treebank_en(self, text):
        if self.word_tokenizer is None:
            import nltk

            self.word_tokenizer = nltk.TreebankWordTokenizer()

        return [
            token.replace("''", '"').replace("``", '"')
            for token in self.word_tokenizer.tokenize(text)
        ]

    def _spacy_en(self, text):
        if self.word_tokenizer is None:
            from claf.tokens.tokenizer.utils import load_spacy_model_for_tokenizer

            self.word_tokenizer = load_spacy_model_for_tokenizer(
                self.extra_split_chars_re)

        def _remove_spaces(tokens):
            return [token.text for token in tokens if not token.is_space]

        return _remove_spaces(self.word_tokenizer(text))

    def _bert_basic(self, text):
        if self.word_tokenizer is None:
            from pytorch_pretrained_bert.tokenization import BasicTokenizer

            self.word_tokenizer = BasicTokenizer(**self.config)

        return self.word_tokenizer.tokenize(text)

    def _mecab_ko(self, text):
        if self.word_tokenizer is None:
            from konlpy.tag import Mecab

            self.word_tokenizer = Mecab()

        return self.word_tokenizer.morphs(text)
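The extra-split regex built in make_split_regex_expression above is easier to follow with a standalone check. Below is a minimal sketch that rebuilds an equivalent pattern over a subset of the characters listed above and applies it to a made-up token.

import re

# Same idea as make_split_regex_expression: split punctuation-like characters
# off into their own tokens to avoid UNKs from contractions and quotation.
extra_split_chars = ("-", "/", "~", '"', "'", ".", ":")  # subset of the characters used above
extra_split_tokens = ("``", "(?<=[^_])_(?=[^_])", "''", "[" + "".join(extra_split_chars) + "]")
splitter = re.compile("(" + "|".join(extra_split_tokens) + ")")

token = "rock'n'roll."  # made-up example token
print([piece for piece in splitter.split(token) if piece != ""])
# ['rock', "'", 'n', "'", 'roll', '.']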
!wget "https://drive.google.com/uc?export=download&id=1ByJN0Vh4ctIwNdnO2jevEcBgrbRHuNyM" -O "ratings_train.txt"
!wget "https://drive.google.com/uc?export=download&id=1fNm-8pQJsuDbFaVIMow1DI-7lsnNLRFB" -O "ratings_test.txt"

train_data = pd.read_table('ratings_train.txt')
test_data = pd.read_table('ratings_test.txt')

train_data.head()

len(train_data)

from konlpy.tag import Mecab

mecab  = Mecab()

X_train = [mecab.morphs(sentence) for sentence in train_data['document']]

X_test = [mecab.morphs(sentence) for sentence in test_data['document']]

X_train[:3]  # There are preprocessing tools for Korean spacing (word segmentation), but we'll cover them next time

vocab_size = 21645

tokenizer = Tokenizer(vocab_size, oov_token='OOV') 
tokenizer.fit_on_texts(X_train)

tokenizer.word_index

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
Example #6
df['code'].value_counts().plot(kind='bar')

print(df.groupby('code').size().reset_index(name='count'))

"""## 토큰화(Mecab)"""

from konlpy.tag import Mecab
tokenizer = Mecab()

kor_text = '밤에 귀가하던 여성에게 범죄를 시도한 대 남성이 구속됐다서울 제주경찰서는 \
            상해 혐의로 씨를 구속해 수사하고 있다고 일 밝혔다씨는 지난달 일 피해 여성을 \
            인근 지하철 역에서부터 따라가 폭행을 시도하려다가 도망간 혐의를 받는다피해 \
            여성이 저항하자 놀란 씨는 도망갔으며 신고를 받고 주변을 수색하던 경찰에 \
            체포됐다피해 여성은 이 과정에서 경미한 부상을 입은 것으로 전해졌다'

print(tokenizer.morphs(kor_text))

"""## 불용어(stopwords)제거"""

stopwords = ['에','는','은','을','했','에게','있','이','의','하','한','다','과','때문','할','수','무단','따른','및','금지','전재','경향신문','기자','는데','가','등','들','파이낸셜','저작','등','뉴스']

# Function that tokenizes the data and removes stopwords during tokenization
def preprocessing(data):
  text_data = []

  for sentence in data:
    temp_data = []
    # Tokenize
    temp_data = tokenizer.morphs(sentence)
    # Remove stopwords
    temp_data = [word for word in temp_data if not word in stopwords]
    text_data.append(temp_data)

  return text_data
Example #7
# MeCab (Eunjeon / mecab-ko) version
''' 
import MeCab

m = MeCab.Tagger()
out = m.parse("안녕하세요")
print(out)
'''

# konlpy version
from konlpy.tag import Mecab
mecab = Mecab(dicpath=r"C:\mecab\mecab-ko-dic")
out = mecab.morphs("오늘저녁먹었어??")
print(out)

# Word2Vec module
from gensim.models import Word2Vec, KeyedVectors

# Test
training_data_path = r"C:\github\python\mecab\TrainingData\training_data_dialogflow.txt"
model_path = r"C:\github\python\mecab\TrainingData\training_model"
dialog_file = open(training_data_path, 'rt', encoding='utf-8')
training_word_data = []

##### Morphological analysis #####
while True:
    # Read one line of user utterances
    line = dialog_file.readline()

    if line:
        temp = []
Example #8
    model.compile(loss="binary_crossentropy",
                  optimizer="adam",
                  metrics=["acc"])
    return model


if __name__ == '__main__':
    df = pd.read_table("train_20181105-20181126.txt")
    mecab = Mecab()
    reviews = []
    labels = []
    all_tokens = []
    unique_tokens = dict()
    for i in range(len(df)):
        try:
            tokens = mecab.morphs(df["document"][i])
            reviews.append(tokens)
            labels.append(df["label"][i])

            all_tokens += tokens
            for t in tokens:
                if t in unique_tokens.keys():
                    unique_tokens[t] += 1
                else:
                    unique_tokens[t] = 1
        except:
            pass
    token_to_idx, idx_to_token = create_dictionary(unique_tokens, 100)
    print("Number of using token: ", len(token_to_idx), len(idx_to_token))

    i = 0
Example #9
def write_output_files(json_input, out_dir, max_comment_num):
    # For each title:
    #   replace all whitespace in the title with single spaces
    #   split the cleaned title on spaces and add the tokens to the vocab set
    #   iterate over the comments:
    #       replace all whitespace in each comment with single spaces
    #       write the title to the title file and the comment to the comment file
    #       split the comment on spaces and add the tokens to the vocab set
    join_out_path = lambda f: path.join(out_dir, f)
    open_out_file = lambda p, mod: open(
        join_out_path(p), mod, encoding='utf-8')
    vocab = set()
    total_comment_num = 0
    mecab = Mecab()

    for i, dic in enumerate(json_input):
        title = white_to_space.sub(" ", dic["title"])
        title = remove_special_char.sub("", title)
        print("title:", title, file=sys.stderr)
        morphs = mecab.morphs(title)
        print("title morphs:", morphs, file=sys.stderr)
        vocab.update(morphs)
        title = ' '.join(morphs)
        #comment_it = itertools.islice(map(lambda cmt: white_to_space.sub(" ", cmt), dic["comments"]), max_comment_num - total_comment_num)
        #comments = list(comment_it)

        comments = [
            remove_special_char.sub("", white_to_space.sub(" ", cmt))
            for cmt in dic["comments"]
        ]
        for j, comment in enumerate(comments):
            print("comment:", comment, file=sys.stderr)
            morphs = mecab.morphs(comment)
            print("comment morphs:", morphs, file=sys.stderr)
            vocab.update(morphs)
            comment = ' '.join(morphs)
            comments[j] = comment

        total_comment_num += len(comments)
        train_set_num = int(len(comments) * 0.7)

        is_this_last_item = (i + 1) == len(json_input)
        terminator = '\n' if not is_this_last_item else ''
        open_mode = 'a' if i != 0 else 'w'

        with open_out_file("train.title",
                           open_mode) as train_title, open_out_file(
                               "train.comment", open_mode) as train_comment:
            train_title.write(
                '\n'.join(itertools.repeat(title, train_set_num)) + terminator)
            train_comment.write('\n'.join(comments[:train_set_num]) +
                                terminator)

        with open_out_file("test.title",
                           open_mode) as test_title, open_out_file(
                               "test.comment", open_mode) as test_comment:
            test_title.write('\n'.join(
                itertools.repeat(title,
                                 len(comments) - train_set_num)) + terminator)
            test_comment.write('\n'.join(comments[train_set_num:]) +
                               terminator)

    with open_out_file("vocab.title", 'w') as vocab_title:
        vocab_title.write('<unk>\n<s>\n</s>\n')
        vocab_title.write('\n'.join(vocab))

    shutil.copyfile(join_out_path("test.title"), join_out_path("dev.title"))
    shutil.copyfile(join_out_path("test.comment"),
                    join_out_path("dev.comment"))
Example #10
# Extract only the morphemes that will be used as tags
pattern = re.compile('MM|NNG|VA[+].*|VV[+].*|XR')
df_tags = pd.DataFrame(columns=['score', 'tags'], dtype='int64')
taglist = []
for place in tagged:
    tag = np.array(place)
    npbool = []
    for t in tag:
        npbool.append(re.fullmatch(pattern, t[1]) != None)
    tag = tag[npbool].tolist()
    taglist.append(tag)
df_tags['tags'] = taglist
df_tags['score'] = df['score'].astype('int64')
print(df_tags['tags'][0])

corpus = [mecab.morphs(sentence) for sentence in array]
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
from gensim.models import Word2Vec
import time
start = time.time()
print("train start")
model = Word2Vec(corpus,
                 size=300,
                 window=10,
                 min_count=10,
                 workers=8,
                 iter=100,
                 sg=1,
                 sample=1e-3)
Example #11
# Default Value
if getFileName == "":
    getFileName = "KakaoTalkChats.txt"

with open(getFileName, 'r') as f:
    chatLog = f.readlines()

NLPData = []
wordDic = {}

# Get Chat Content
for i in range(len(chatLog)):
    NLPData.append(chatLog[i].split(":")[-1])

# Processing
for k in range(len(NLPData)):
    wordList = mecab.morphs(NLPData[k])

    # Dictionary Count
    for j in range(len(wordList)):
        wordDic[wordList[j]] = wordDic.get(wordList[j], 0) + 1

# Save DataFile
with open("chatLog.txt", 'w') as f:
    f.write("채팅방 이름: " + chatLog[0])
    sortedDic = sorted(wordDic.items(), key=lambda k: k[1], reverse=True)

    for i in range(len(sortedDic)):
        data = sortedDic[i][0] + " : " + str(sortedDic[i][1]) + "\n"
        f.write(data)
Example #12
class ManageData(object):
    def __init__(self,
                 src_token_dict=None,
                 filelist=None,
                 max_len=128,
                 num_class=2):
        self.filelist = filelist
        self.max_len = max_len
        self.num_class = num_class

        self.mecab = Mecab()

        if src_token_dict is not None:
            self.token_dict = load_token_id(src_token_dict)

    def _preprocess(self, sentence):
        try:
            return self.mecab.morphs(sentence)
        except:
            return None

    def _parse_data(self, filename):
        dataset = pd.read_csv(filename, delimiter='\t', header=0)
        sentence_list = dataset['document']
        label_list = dataset['label']

        return sentence_list, label_list

    def _to_onehot(self, label):
        tmp = [0] * self.num_class
        tmp[label] = 1
        return tmp

    def make_input(self, sentence):
        """
        From an input sentence: preprocess, map tokens to ids, and pad.
        input sentence: '나는 학교에 간다'
        output processed: [4,22,14,5,253,82,0,0,0, ...]
        """
        processed = self._preprocess(sentence)
        processed = tokenlist2idlist(processed, self.token_dict)
        processed = padding(processed, self.max_len)

        return processed

    def generator(self):
        """ return generator yields preprocessed data
            filelist: tsv based data with 'document', 'label' col
        """
        for filename in self.filelist:
            # parse tsv
            sentences, labels = self._parse_data(filename)

            for sentence, label in zip(sentences, labels):
                try:
                    processed = self.make_input(sentence)

                    if processed is None: continue

                    yield (processed, self._to_onehot(label))
                except:
                    continue
Example #13
def get_korean_morphs(words):
    mecab = Mecab(dicpath="C:\\mecab\\mecab-ko-dic")
    return mecab.morphs(words)  # utility that converts a sentence into morphemes
Example #14
from konlpy.tag import Mecab

tokenizer = Mecab()
print(tokenizer.morphs("아버지가방에들어가신다"))
Example #15
# import MeCab as mecab
from konlpy.tag import Mecab
mecab = Mecab()

morphs_l = mecab.morphs('영등포구청역에 있는 맛집 좀 알려주세요.')
print('morphs_l')
print(morphs_l)
morphs_txt = ' '.join(morphs_l)

print('morphs_txt')
print(morphs_txt)

nouns = mecab.nouns('영등포구청역에 있는 맛집 좀 알려주세요.')
print('nouns')
print(nouns)
# ['영등포구청역', '에', '있', '는', '맛집', '좀', '알려', '주', '세요', '.']
Example #16
class Tokenizer(object):
    def __init__(self):
        self.mecab = Mecab()

    def __call__(self, phrase):
        return self.mecab.morphs(phrase=phrase)
Example #17
data_path = Path(data_root) / 'kor_pair_train.csv'
data_df = pd.read_csv(data_path)[['question1', 'question2', 'is_duplicate']]
data_df.is_duplicate = data_df.is_duplicate.map(
    lambda x: 1 - x
)  # flip the is_duplicate label: 1 if duplicate, 0 otherwise (the current notation is confusing...)

# train-validation split
train_df, val_df = train_test_split(data_df, test_size=0.2)

# save train / validation data
train_df.to_csv(Path(data_root) / 'tr_pairs.csv', index=False)
val_df.to_csv(Path(data_root) / 'val_pairs.csv', index=False)

# build vocab
tokenizer = Mecab()
tr_tokenized_q1 = [tokenizer.morphs(q) for q in train_df.question1]
tr_tokenized_q2 = [tokenizer.morphs(q) for q in train_df.question2]

counter = nlp.data.count_tokens(
    itertools.chain.from_iterable(
        [tokens for tokens in tr_tokenized_q1 + tr_tokenized_q2]))
vocab = nlp.Vocab(counter=counter, min_freq=10)

# connecting embedding to vocab
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.ko')
vocab.set_embedding(ptr_embedding)

# save vocab
vocab_path = Path(data_root) / 'word_vocab.pkl'
with open(vocab_path, mode='wb') as io:
    pickle.dump(vocab, io)
Example #18
        news_str = re.sub(re.compile('\<'), '', news_str)
        news_str = re.sub(re.compile('■'), '', news_str)
        news_str = re.sub(re.compile('◆'), '', news_str)
        news_str = re.sub(re.compile("'"), '', news_str)
        news_str = re.sub(re.compile('‘'), '', news_str)
        news_str = re.sub(re.compile('’'), '', news_str)
        news_str = re.sub(DATE_PATTERN, ' ', news_str)
        # news_str = re.sub(re.compile("\.", re.UNICODE), '.\n', news_str)
        news_str = re.sub(EMPTY_PARENTHESIS, ' ', news_str)
        news_str = re.sub(NEW_LINE, ' ', news_str)
        news_str = re.sub(MULTIPLE_SPACES, '', news_str)
        news_str = re.sub(re.compile('\\xa0'), '', news_str)
        news_str = re.sub(re.compile("저작권자 SPOTV NEWS 무단전재 및 재배포 금지"), ' ', news_str)


        news_str = news_str.strip()
        # print(news_str)

        tokenizer = Mecab()
        tokens = tokenizer.morphs(news_str)
        corpus_file.writelines("%s " % token for token in tokens)
        corpus_file.write("\n")


        ####### JUST ONE NEWS FOR TEST ########
        # break
        #######################################
        
    
    connection.close()
    corpus_file.close()
Example #19
#!/usr/bin/env python3

from konlpy.tag import Hannanum, Kkma, Komoran, Mecab, Okt

hannanum = Hannanum()
print('[Hannanum]')
print(hannanum.analyze('롯데마트의 흑마늘 양념 치킨이 논란이 되고 있다.'))

kkma = Kkma()
print('[Kkma]')
print(kkma.morphs('공부를 하면할수록 모르는게 많다는 것을 알게 됩니다.'))

komoran = Komoran()
print('[Komoran]')
print(komoran.morphs(u'우왕 코모란도 오픈소스가 되었어요'))

mecab = Mecab()
print('[Mecab]')
print(mecab.morphs(u'영등포구청역에 있는 맛집 좀 알려주세요.'))

okt = Okt()
print('[Okt]')
print(okt.morphs(u'단독입찰보다 복수입찰의 경우'))
Example #20
ls

cd Mecab-ko-for-Google-Colab/

ls

! bash install_mecab-ko_on_colab190912.sh

!pip install konlpy

from konlpy.tag import Mecab

mecab = Mecab()

print(mecab.morphs('파일을로컬에받아놓아도다운이되는건가?'))

# Commented out IPython magic to ensure Python compatibility.
import os
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np

# %matplotlib inline

# from google.colab import files
# uploaded = files.upload()
# print(uploaded.keys())

# for fn in uploaded.keys():
#   print('User uploaded file "{name}" with length {length} bytes'.format(
Example #21
import pandas as pd
import numpy as np
import tensorflow as tf
import autokeras as ak
from konlpy.tag import Mecab

train = pd.read_csv("1.csv")
test = pd.read_csv("2.csv")
submission = pd.read_csv("sample_submission.csv")

mecab = Mecab()

train['content'] = train['content'].map(lambda x: ' '.join(mecab.morphs(x)))
test['content'] = test['content'].map(lambda x: ' '.join(mecab.morphs(x)))

x_train = train['content'].values
y_train = train['info'].values

input_node = ak.TextInput()
output_node = ak.TextBlock()(input_node)
output_node = ak.ClassificationHead()(output_node)
clf = ak.AutoModel(inputs=input_node,
                   outputs=output_node,
                   overwrite=True,
                   max_trials=20)
clf.fit(x_train, y_train, epochs=5)
model = clf.export_model()

tf.keras.backend.clear_session()
model = tf.keras.models.load_model('./auto_model/best_model')
Example #22
from konlpy.tag import Mecab
from gensim.models import KeyedVectors

# Data Cleansing
mecab = Mecab()
articlesMorphs = open("articlesMorphs.txt", "w")

for news_name in [
        'chosun_full', 'donga_full', 'hani_full', 'joongang_full', 'kh_full'
]:
    with open('./Data/news/' + news_name + '.txt', 'r') as f:
        lines = f.read()
        sentences = lines.split('.')

        for i in range(len(sentences)):
            dic = mecab.morphs(sentences[i])
            dicSize = len(dic)

            for idx in range(dicSize - 1):
                if (idx == dicSize):
                    break

                else:
                    if (dic[idx] == '새' and dic[(idx + 1)] == '정치'):
                        dic.remove('새')
                        dic.remove('정치')
                        dic.insert(idx, "새정치")
                        dicSize = dicSize - 1

            for word in dic:
                articlesMorphs.write("%s " % word)
Example #23
def tokenize(sentence):
    tagger = Mecab()
    s = " ".join(tagger.morphs(sentence))
    logger.info("tokenized:" + s)
    return s
Example #24
data_path = "info/train.txt"
vocab_path = "info/mecab-vocab.json"
save_path = "info/mecab-embedding"
embedding_size = 100
vocab_size = 10000
seed = 100

np.random.seed(seed=seed)
tokenizer = Mecab()

data = read_text(data_path)
data = make_texts(data)
data = [preprocess_text(text) for text in data]

tokenized_texts = [tokenizer.morphs(text) for text in tqdm(data)]

model = Word2Vec(sentences=tokenized_texts,
                 size=embedding_size,
                 window=5,
                 min_count=5,
                 workers=4,
                 sg=0)

word2vec = model.wv
vocab = read_json(vocab_path)

vectors = []
for key, value in vocab.items():
    try:
        vectors.append(word2vec[key])
    except KeyError:
        # Assumed completion (the snippet is truncated here): fall back to a
        # zero vector for vocabulary words missing from the Word2Vec model.
        vectors.append(np.zeros(embedding_size))
Example #25
#   - To avoid the situation described above, Korean text is usually tokenized with a morphological analyzer.
#   - Here we will use Mecab among the available morphological analyzers.
#   - Install Mecab on Colab with the commands below.
#   - Unlike the earlier example, particles such as '의', '를', '가', and '랑' are all split off, so the machine can treat '사과' as a single word (a short self-contained sketch follows after the install commands below).
#
# ```
# git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
# cd Mecab-ko-for-Google-Colab
# chmod u+x install_mecab-ko_on_colab190912.sh
# ./install_mecab-ko_on_colab190912.sh
# ```

from konlpy.tag import Mecab

tokenizer = Mecab()
mu.log("tokenizer.morphs(kor_text)", tokenizer.morphs(kor_text))

################################################################################
# - Building the vocabulary
#   - A vocabulary is the set of all words in the text, with duplicates removed.
#   - First, for this exercise, we download the 'Naver movie review classification' data from GitHub.
#   - The Naver movie review data consists of 200,000 movie reviews labeled 1 (positive) or 0 (negative) (a short sketch of the vocabulary-building step follows after the imports below).
#
# ```
# pip3 install pandas
# ```

import urllib.request
import pandas as pd
from konlpy.tag import Mecab
from nltk import FreqDist
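The snippet stops after the imports, so below is a minimal sketch of the step described above: tokenize the reviews with Mecab and build the vocabulary with FreqDist. It assumes the NSMC file ratings_train.txt downloaded in Example #5 is present; the 'document' column name comes from that dataset.

import pandas as pd
from konlpy.tag import Mecab
from nltk import FreqDist

data = pd.read_table('ratings_train.txt')   # assumed: NSMC file downloaded in Example #5
data = data.dropna(subset=['document'])     # drop rows with missing review text

mecab = Mecab()
tokenized = [mecab.morphs(review) for review in data['document'].head(1000)]  # small slice for speed

# The vocabulary is the set of unique tokens; FreqDist also keeps per-token counts.
vocab = FreqDist(token for review in tokenized for token in review)
print('vocabulary size:', len(vocab))
print(vocab.most_common(10))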
#     inputs = inputs
#     targets = targets
#     preds = model(inputs)
#     preds = preds.round()
#     num_hit+=torch.eq(preds.squeeze(),targets.squeeze()).sum().item() #data[0]

# print(num_hit/len(test_data)*100)


### test
# test_inputs = ["헐 진짜 개별로다..", "진짜 너무 재밌는 영화다 오랜만에","오..이건 진짜 봐야함", "진짜 쓰레기 같은 영화","노잼","존잼","꾸울잼","핵노잼",'또 보고싶다', '꼬옥 봐야한다.. 진짜..', '나만 보기 아깝다', '돈이 아깝다', '나만 보기 억울하다', '나만 당할 수 없다', '너도 봐야한다', '혼자 본게 정말 후회된다. 이건 꼭 같이 봐야한다.', '재미없어요...', '꾸르르르르르르잼', '꾸르르르잼', '꾸르잼', '이 영화를 보고 암이 나았습니다.']
test_inputs = ['이 영화를 보고 암이 나았습니다.']


for test_input in test_inputs:
    tokenized = tagger.morphs(test_input)
    tokenized = pad_under_five(tokenized)
    input_ = TEXT.numericalize([tokenized], device=DEVICE)
    print (input_)
    if USE_CUDA: input_ = input_.cuda()

    prediction = model(input_)
    prediction = prediction.round()
    prediction = "긍정" if prediction.data[0][0] == 1 else "부정"
    if prediction=="긍정":
        print(test_input,"\033[1;01;36m" + prediction + "\033[0m")
        # print(len(tokenized), tokenized)
    else:
        print(test_input,"\033[1;01;31m" + prediction + "\033[0m")
        # print(len(tokenized), tokenized)
    def morphs_analysis(self, data):
        mecab = Mecab(self.path)
        ls = []
        for x in tqdm(range(len(data))):
            ls.append(mecab.morphs(data[x]))
        return list(chain.from_iterable(ls))