"""## Mecab"""

!git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
cd Mecab-ko-for-Google-Colab
ls
!bash install_mecab-ko_on_colab190912.sh

from konlpy.tag import Mecab

mecab = Mecab()
print(mecab.morphs('노트북 사고 싶다'))

"""## Load data"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import urllib.request
from collections import Counter
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

urllib.request.urlretrieve("https://raw.githubusercontent.com/bab2min/corpus/master/sentiment/naver_shopping.txt", filename="ratings_total.txt")
total_data = pd.read_table('ratings_total.txt', names=['ratings', 'reviews'])
from konlpy.tag import Mecab, Okt, Kkma, Twitter, Komoran
# Mecab: currently the fastest ( > Twitter )
# Okt: renamed from Twitter
# Kkma: extracts accurate POS information
# Komoran: when both accuracy and speed matter

text = "한글 자연어 처리는 재밌다 이제부터 열심히 해야지 ㅎㅎㅎ"

mecab = Mecab()
mecab.morphs(text)

kkma = Kkma()
kkma.morphs(text)

komoran = Komoran()
komoran.morphs(text)

okt = Okt()
okt.morphs(text, stem=True)
okt.pos(text, stem=True)  # POS tagging with stemming
okt.pos(text, join=True)  # POS tagging joined into 'token/tag' strings
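# The comments above claim Mecab is the fastest of these taggers. A minimal
# sketch to check that claim on your own machine, assuming the taggers above
# are installed; timings vary with hardware and dictionary version.
import time

def time_tagger(tagger, sample, n=20):
    # Average wall-clock time of tagger.morphs over n runs.
    start = time.perf_counter()
    for _ in range(n):
        tagger.morphs(sample)
    return (time.perf_counter() - start) / n

for name, tagger in [('mecab', mecab), ('kkma', kkma), ('komoran', komoran), ('okt', okt)]:
    print(name, time_tagger(tagger, text))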
import torch
import torch.nn as nn
from transformers import AutoTokenizer, BertForSequenceClassification
from konlpy.tag import Mecab

gpu_id = -1  # mirrors config.gpu_id; set to a CUDA device index (>= 0) to use a GPU

def predict(text):
    top_k = 5
    saved_data = torch.load(
        'kcbert_novalid_9.pth',
        map_location='cpu')  # map_location='cpu' if config.gpu_id < 0 else 'cuda:%d' % config.gpu_id

    train_config = saved_data['config']
    bert_best = saved_data['bert']
    index_to_label = saved_data['classes']

    # read_text is a helper from the surrounding project that returns a list of lines.
    lines = read_text(text)
    mecab = Mecab()
    mc_lines = mecab.morphs(lines[0])
    texts = ' '.join(mc_lines)
    print(texts)

    with torch.no_grad():
        # Declare model and load pre-trained weights.
        tokenizer = AutoTokenizer.from_pretrained(train_config.pretrained_model_name)
        model = BertForSequenceClassification.from_pretrained(
            train_config.pretrained_model_name,
            num_labels=len(index_to_label)
        )
        model.load_state_dict(bert_best)

        if gpu_id >= 0:
            model.cuda(gpu_id)
        device = next(model.parameters()).device

        # Don't forget to turn on evaluation mode.
        model.eval()

        mini_batch = tokenizer(
            [texts],
            padding=True,
            truncation=True,
            return_tensors="pt",
        )

        x = mini_batch['input_ids']
        x = x.to(device)
        mask = mini_batch['attention_mask']
        mask = mask.to(device)

        # Feed-forward
        y_hat = model(x, attention_mask=mask)[0]
        # print(y_hat)
        probs = nn.Softmax(dim=1)
        softmax_output = probs(y_hat)
        print(softmax_output)

        probs, indice = softmax_output.cpu().topk(top_k)
        # |indice| = (len(lines), top_k)
        print(indice)
        print(probs)

        results = [index_to_label[int(indice[0][j])] for j in range(top_k)]
        probs = [round(probs[0][j].item(), 4) for j in range(top_k)]
        res = list(zip(results, probs))

    return res
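# Minimal usage sketch of predict(): it returns (label, probability) pairs for
# the top-k classes. The checkpoint name comes from the code above, but the
# input sentence is an illustrative assumption.
if __name__ == '__main__':
    for label, prob in predict('배송도 빠르고 정말 좋아요'):
        print('%s\t%.4f' % (label, prob))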
class WordTokenizer(Tokenizer):
    """
    Word Tokenizer

    * Args:
        name: tokenizer name [treebank_en|spacy_en|mecab_ko|bert_basic]

    * Kwargs:
        flatten: return type as flatten list
        split_with_regex: post split action. Split tokens that the tokenizer cannot split.
    """

    def __init__(self, name, sent_tokenizer, config={}, split_with_regex=True):
        super(WordTokenizer, self).__init__(name, f"word-{name}+{sent_tokenizer.cache_name}")
        self.config = config
        self.sent_tokenizer = sent_tokenizer
        self.word_tokenizer = None

        self.split_with_regex = split_with_regex
        if split_with_regex:
            self.extra_split_chars_re = self.make_split_regex_expression()

    def make_split_regex_expression(self):
        """
        Apply a small amount of extra splitting to the given tokens, in
        particular to avoid UNK tokens due to contraction, quotation, or other
        forms of punctuation. I haven't really done tests to see if/how much
        difference this makes, but it does avoid some common UNKs I noticed in
        SQuAD/TriviaQA.
        """
        extra_split_chars = (
            "-", "£", "€", "¥", "¢", "₹", "*", "\u2212", "\u2014", "\u2013",
            "/", "~", '"', "'", "\ud01C", "\u2019", "\u201D", "\u2018",
            "\u00B0", ".", ":",
        )
        extra_split_tokens = (
            "``",
            "(?<=[^_])_(?=[^_])",  # dashes w/o a preceding or following dash, so __wow___ -> ___ wow ___
            "''",
            "[" + "".join(extra_split_chars) + "]",
        )
        return re.compile("(" + "|".join(extra_split_tokens) + ")")

    @overrides
    def _tokenize(self, text, unit="text"):
        """ Text -> word tokens """
        if type(text) != str:
            raise ValueError(f"text type is must be str. not {type(text)}")

        if unit == "sentence":
            tokens = getattr(self, f"_{self.name}")(text)
        else:
            sentences = self.sent_tokenizer.tokenize(text)
            tokens = [
                getattr(self, f"_{self.name}")(sentence)
                for sentence in sentences
            ]
            if self.split_with_regex and self.name != "spacy_en":
                tokens = self._split_with_regex(tokens)

        return list(common_utils.flatten(tokens))

    def _split_with_regex(self, sentences):
        for i, sentence in enumerate(sentences):
            sentences[i] = [token for token in self._post_split_tokens(sentence)]
        return sentences

    def _post_split_tokens(self, tokens):
        return [[x for x in self.extra_split_chars_re.split(token) if x != ""]
                for token in tokens]

    """ Tokenizers """

    def _space_all(self, text):
        def is_whitespace(c):
            if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
                return True
            return False

        prev_is_whitespace = True
        tokens = []
        for char in text:
            if is_whitespace(char):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    tokens.append(char)
                else:
                    tokens[-1] += char
                prev_is_whitespace = False
        return tokens

    def _treebank_en(self, text):
        if self.word_tokenizer is None:
            import nltk
            self.word_tokenizer = nltk.TreebankWordTokenizer()
        return [
            token.replace("''", '"').replace("``", '"')
            for token in self.word_tokenizer.tokenize(text)
        ]

    def _spacy_en(self, text):
        if self.word_tokenizer is None:
            from claf.tokens.tokenizer.utils import load_spacy_model_for_tokenizer
            self.word_tokenizer = load_spacy_model_for_tokenizer(self.extra_split_chars_re)

        def _remove_spaces(tokens):
            return [token.text for token in tokens if not token.is_space]

        return _remove_spaces(self.word_tokenizer(text))

    def _bert_basic(self, text):
        if self.word_tokenizer is None:
            from pytorch_pretrained_bert.tokenization import BasicTokenizer
            self.word_tokenizer = BasicTokenizer(**self.config)
        return self.word_tokenizer.tokenize(text)

    def _mecab_ko(self, text):
        if self.word_tokenizer is None:
            from konlpy.tag import Mecab
            self.word_tokenizer = Mecab()
        return self.word_tokenizer.morphs(text)
!wget "https://drive.google.com/uc?export=download&id=1ByJN0Vh4ctIwNdnO2jevEcBgrbRHuNyM" -O "ratings_train.txt" !wget "https://drive.google.com/uc?export=download&id=1fNm-8pQJsuDbFaVIMow1DI-7lsnNLRFB" -O "ratings_test.txt" train_data = pd.read_table('ratings_train.txt') test_data = pd.read_table('ratings_test.txt') train_data.head() len(train_data) from konlpy.tag import Mecab mecab = Mecab() X_train = [mecab.morphs(sentence) for sentence in train_data['document']] X_test = [mecab.morphs(sentence) for sentence in test_data['document']] X_train[:3]#띄어쓰기를 위한 전처리 도구가 있지만 다음 시간에 vocab_size = 21645 tokenizer = Tokenizer(vocab_size, oov_token='OOV') tokenizer.fit_on_texts(X_train) tokenizer.word_index X_train = tokenizer.texts_to_sequences(X_train) X_test = tokenizer.texts_to_sequences(X_test)
df['code'].value_counts().plot(kind='bar')
print(df.groupby('code').size().reset_index(name='count'))

"""## Tokenization (Mecab)"""

from konlpy.tag import Mecab

tokenizer = Mecab()

kor_text = '밤에 귀가하던 여성에게 범죄를 시도한 대 남성이 구속됐다서울 제주경찰서는 \
상해 혐의로 씨를 구속해 수사하고 있다고 일 밝혔다씨는 지난달 일 피해 여성을 \
인근 지하철 역에서부터 따라가 폭행을 시도하려다가 도망간 혐의를 받는다피해 \
여성이 저항하자 놀란 씨는 도망갔으며 신고를 받고 주변을 수색하던 경찰에 \
체포됐다피해 여성은 이 과정에서 경미한 부상을 입은 것으로 전해졌다'

print(tokenizer.morphs(kor_text))

"""## Stopword removal"""

stopwords = ['에', '는', '은', '을', '했', '에게', '있', '이', '의', '하', '한', '다', '과', '때문', '할', '수', '무단', '따른', '및', '금지', '전재', '경향신문', '기자', '는데', '가', '등', '들', '파이낸셜', '저작', '뉴스']

# Tokenize, removing stopwords during tokenization
def preprocessing(data):
    text_data = []
    for sentence in data:
        # Tokenize
        temp_data = tokenizer.morphs(sentence)
        # Remove stopwords
        temp_data = [word for word in temp_data if not word in stopwords]
        text_data.append(temp_data)
    return text_data
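# Quick usage sketch of the function above on the sample news text; the
# one-element input list is just for illustration.
cleaned = preprocessing([kor_text])
print(cleaned[0][:10])  # first ten stopword-filtered tokens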
# eunjeon (은전한닢) version
'''
import MeCab
m = MeCab.Tagger()
out = m.parse("안녕하세요")
print(out)
'''

# konlpy version
from konlpy.tag import Mecab

mecab = Mecab(dicpath=r"C:\mecab\mecab-ko-dic")
out = mecab.morphs("오늘저녁먹었어??")
print(out)

# Word2Vec module
from gensim.models import Word2Vec, KeyedVectors

# Test
training_data_path = "C:\github\python\mecab\TrainingData\\training_data_dialogflow.txt"
model_path = 'C:\github\python\mecab\TrainingData\\training_model'

dialog_file = open(training_data_path, 'rt', encoding='utf-8')
training_word_data = []

##### Morphological analysis ######
while True:
    # Read one line of user utterances
    line = dialog_file.readline()
    if line:
        temp = []
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["acc"])
    return model


if __name__ == '__main__':
    df = pd.read_table("train_20181105-20181126.txt")
    mecab = Mecab()

    reviews = []
    labels = []
    all_tokens = []
    unique_tokens = dict()

    for i in range(len(df)):
        try:
            tokens = mecab.morphs(df["document"][i])
            reviews.append(tokens)
            labels.append(df["label"][i])
            all_tokens += tokens
            for t in tokens:
                if t in unique_tokens.keys():
                    unique_tokens[t] += 1
                else:
                    unique_tokens[t] = 1
        except:
            pass

    token_to_idx, idx_to_token = create_dictionary(unique_tokens, 100)
    print("Number of using token: ", len(token_to_idx), len(idx_to_token))

    i = 0
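# create_dictionary is a project helper not shown above. A plausible sketch,
# assuming its second argument is a minimum-frequency cut-off; the reserved
# index 0 for padding is also an assumption.
def create_dictionary(unique_tokens, min_count):
    # Keep tokens seen at least min_count times, reserving index 0 for padding.
    kept = [t for t, c in unique_tokens.items() if c >= min_count]
    token_to_idx = {t: i + 1 for i, t in enumerate(kept)}
    idx_to_token = {i + 1: t for i, t in enumerate(kept)}
    return token_to_idx, idx_to_token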
import itertools
import shutil
import sys
from os import path

from konlpy.tag import Mecab

# white_to_space and remove_special_char are module-level regexes defined elsewhere.

def write_output_files(json_input, out_dir, max_comment_num):
    # For each title:
    #   replace the title's whitespace characters with plain spaces
    #   split the converted title on spaces and add the pieces to the vocab set
    #   iterate over its comments:
    #     replace all whitespace characters in the comment with plain spaces
    #     write the title to the title file and the comment to the comment file
    #     split the comment on spaces and add the pieces to the vocab set
    join_out_path = lambda f: path.join(out_dir, f)
    open_out_file = lambda p, mod: open(join_out_path(p), mod, encoding='utf-8')

    vocab = set()
    total_comment_num = 0
    mecab = Mecab()

    for i, dic in enumerate(json_input):
        title = white_to_space.sub(" ", dic["title"])
        title = remove_special_char.sub("", title)
        print("title:", title, file=sys.stderr)

        morphs = mecab.morphs(title)
        print("title morphs:", morphs, file=sys.stderr)
        vocab.update(morphs)
        title = ' '.join(morphs)

        #comment_it = itertools.islice(map(lambda cmt: white_to_space.sub(" ", cmt), dic["comments"]), max_comment_num - total_comment_num)
        #comments = list(comment_it)
        comments = [
            remove_special_char.sub("", white_to_space.sub(" ", cmt))
            for cmt in dic["comments"]
        ]
        for j, comment in enumerate(comments):
            print("comment:", comment, file=sys.stderr)
            morphs = mecab.morphs(comment)
            print("comment morphs:", morphs, file=sys.stderr)
            vocab.update(morphs)
            comments[j] = ' '.join(morphs)

        total_comment_num += len(comments)
        train_set_num = int(len(comments) * 0.7)

        is_this_last_item = (i + 1) == len(json_input)
        terminator = '\n' if not is_this_last_item else ''
        open_mode = 'a' if i != 0 else 'w'

        with open_out_file("train.title", open_mode) as train_title, open_out_file(
                "train.comment", open_mode) as train_comment:
            train_title.write('\n'.join(itertools.repeat(title, train_set_num)) + terminator)
            train_comment.write('\n'.join(comments[:train_set_num]) + terminator)

        with open_out_file("test.title", open_mode) as test_title, open_out_file(
                "test.comment", open_mode) as test_comment:
            test_title.write('\n'.join(
                itertools.repeat(title, len(comments) - train_set_num)) + terminator)
            test_comment.write('\n'.join(comments[train_set_num:]) + terminator)

    with open_out_file("vocab.title", 'w') as vocab_title:
        vocab_title.write('<unk>\n<s>\n</s>\n')
        vocab_title.write('\n'.join(vocab))

    shutil.copyfile(join_out_path("test.title"), join_out_path("dev.title"))
    shutil.copyfile(join_out_path("test.comment"), join_out_path("dev.comment"))
# Keep only the morphemes whose POS tags we want to use as tags
pattern = re.compile('MM|NNG|VA[+].*|VV[+].*|XR')
df_tags = pd.DataFrame(columns=['score', 'tags'], dtype='int64')
taglist = []

for place in tagged:
    tag = np.array(place)
    npbool = []
    for t in tag:
        npbool.append(re.fullmatch(pattern, t[1]) != None)
    tag = tag[npbool].tolist()
    taglist.append(tag)

df_tags['tags'] = taglist
df_tags['score'] = df['score'].astype('int64')
print(df_tags['tags'][0])

corpus = [mecab.morphs(sentence) for sentence in array]

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from gensim.models import Word2Vec
import time

start = time.time()
print("train start")
# gensim 3.x keyword names; in gensim 4 these become vector_size= and epochs=
model = Word2Vec(corpus, size=300, window=10, min_count=10, workers=8, iter=100, sg=1, sample=1e-3)
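# Once training finishes, the model can be queried for nearest neighbours; a
# minimal usage sketch. The save path and query word are illustrative
# assumptions, not values from the original script.
print("train took %.1f seconds" % (time.time() - start))
model.save('review_w2v.model')                # hypothetical output path
print(model.wv.most_similar('맛집', topn=5))  # assumed in-vocabulary query word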
# Default value
if getFileName == "":
    getFileName = "KakaoTalkChats.txt"

with open(getFileName, 'r') as f:
    chatLog = f.readlines()

NLPData = []
wordDic = {}

# Get chat content
for i in range(len(chatLog)):
    NLPData.append(chatLog[i].split(":")[-1])

# Processing
for k in range(len(NLPData)):
    wordList = mecab.morphs(NLPData[k])
    # Dictionary count
    for j in range(len(wordList)):
        wordDic[wordList[j]] = wordDic.get(wordList[j], 0) + 1

# Save data file
with open("chatLog.txt", 'w') as f:
    f.write("채팅방 이름: " + chatLog[0])
    sortedDic = sorted(wordDic.items(), key=lambda k: k[1], reverse=True)
    for i in range(len(sortedDic)):
        data = sortedDic[i][0] + " : " + str(sortedDic[i][1]) + "\n"
        f.write(data)
import pandas as pd
from konlpy.tag import Mecab

# load_token_id, tokenlist2idlist, and padding are helpers from the surrounding project.

class ManageData(object):
    def __init__(self, src_token_dict=None, filelist=None, max_len=128, num_class=2):
        self.filelist = filelist
        self.max_len = max_len
        self.num_class = num_class
        self.mecab = Mecab()
        if src_token_dict is not None:
            self.token_dict = load_token_id(src_token_dict)

    def _preprocess(self, sentence):
        try:
            return self.mecab.morphs(sentence)
        except:
            return None

    def _parse_data(self, filename):
        dataset = pd.read_csv(filename, delimiter='\t', header=0)
        sentence_list = dataset['document']
        label_list = dataset['label']
        return sentence_list, label_list

    def _to_onehot(self, label):
        tmp = [0] * self.num_class
        tmp[label] = 1
        return tmp

    def make_input(self, sentence):
        """
        Preprocess an input sentence, map tokens to ids, and pad.

        input sentence: '나는 학교에 간다'
        output processed: [4, 22, 14, 5, 253, 82, 0, 0, 0, ...]
        """
        processed = self._preprocess(sentence)
        if processed is None:  # tokenization failed; bail out before the id lookup
            return None
        processed = tokenlist2idlist(processed, self.token_dict)
        processed = padding(processed, self.max_len)
        return processed

    def generator(self):
        """
        Return a generator that yields preprocessed data.

        filelist: tsv-based data with 'document' and 'label' columns
        """
        for filename in self.filelist:
            # parse tsv
            sentences, labels = self._parse_data(filename)
            for sentence, label in zip(sentences, labels):
                try:
                    processed = self.make_input(sentence)
                    if processed is None:
                        continue
                    yield (processed, self._to_onehot(label))
                except:
                    continue
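# A sketch of how the generator above could feed tf.data; the file names,
# token dictionary path, and batch size are illustrative assumptions.
import tensorflow as tf

md = ManageData(src_token_dict='token_dict.json', filelist=['train.tsv'])
dataset = tf.data.Dataset.from_generator(
    md.generator,
    output_types=(tf.int32, tf.int32),
    output_shapes=((md.max_len,), (md.num_class,)),
).batch(32)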
def get_korean_morphs(words):
    # Utility that converts a sentence into morphemes
    mecab = Mecab(dicpath="C:\\mecab\\mecab-ko-dic")
    return mecab.morphs(words)
from konlpy.tag import Mecab

tokenizer = Mecab()
print(tokenizer.morphs("아버지가방에들어가신다"))
# import MeCab as mecab
from konlpy.tag import Mecab

mecab = Mecab()

morphs_l = mecab.morphs('영등포구청역에 있는 맛집 좀 알려주세요.')
print('morphs_l')
print(morphs_l)
# ['영등포구청역', '에', '있', '는', '맛집', '좀', '알려', '주', '세요', '.']

morphs_txt = ' '.join(morphs_l)
print('morphs_txt')
print(morphs_txt)

nouns = mecab.nouns('영등포구청역에 있는 맛집 좀 알려주세요.')
print('nouns')
print(nouns)
from konlpy.tag import Mecab


class Tokenizer(object):
    def __init__(self):
        self.mecab = Mecab()

    def __call__(self, phrase):
        return self.mecab.morphs(phrase=phrase)
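# Usage sketch of the callable wrapper above; handy wherever a plain function
# tokenizer is expected (e.g. a tokenize= argument). The exact output may vary
# with the installed dictionary version.
tokenize = Tokenizer()
print(tokenize('노트북 사고 싶다'))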
data_path = Path(data_root) / 'kor_pair_train.csv'
data_df = pd.read_csv(data_path)[['question1', 'question2', 'is_duplicate']]
data_df.is_duplicate = data_df.is_duplicate.map(
    lambda x: 1 - x
)  # flip the is_duplicate label: 1 if duplicate, 0 if not (the current notation is confusing...)

# train-validation split
train_df, val_df = train_test_split(data_df, test_size=0.2)

# save train / validation data
train_df.to_csv(Path(data_root) / 'tr_pairs.csv', index=False)
val_df.to_csv(Path(data_root) / 'val_pairs.csv', index=False)

# build vocab
tokenizer = Mecab()
tr_tokenized_q1 = [tokenizer.morphs(q) for q in train_df.question1]
tr_tokenized_q2 = [tokenizer.morphs(q) for q in train_df.question2]

counter = nlp.data.count_tokens(
    itertools.chain.from_iterable(
        [tokens for tokens in tr_tokenized_q1 + tr_tokenized_q2]))
vocab = nlp.Vocab(counter=counter, min_freq=10)

# connecting embedding to vocab
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.ko')
vocab.set_embedding(ptr_embedding)

# save vocab
vocab_path = Path(data_root) / 'word_vocab.pkl'
with open(vocab_path, mode='wb') as io:
    pickle.dump(vocab, io)
news_str = re.sub(re.compile('\<'), '', news_str)
news_str = re.sub(re.compile('■'), '', news_str)
news_str = re.sub(re.compile('◆'), '', news_str)
news_str = re.sub(re.compile("'"), '', news_str)
news_str = re.sub(re.compile('‘'), '', news_str)
news_str = re.sub(re.compile('’'), '', news_str)
news_str = re.sub(DATE_PATTERN, ' ', news_str)
# news_str = re.sub(re.compile("\.", re.UNICODE), '.\n', news_str)
news_str = re.sub(EMPTY_PARENTHESIS, ' ', news_str)
news_str = re.sub(NEW_LINE, ' ', news_str)
news_str = re.sub(MULTIPLE_SPACES, '', news_str)
news_str = re.sub(re.compile('\\xa0'), '', news_str)
news_str = re.sub(re.compile("저작권자 SPOTV NEWS 무단전재 및 재배포 금지"), ' ', news_str)
news_str = news_str.strip()
# print(news_str)

tokenizer = Mecab()
tokens = tokenizer.morphs(news_str)
corpus_file.writelines("%s " % token for token in tokens)
corpus_file.write("\n")

####### JUST ONE NEWS FOR TEST ########
# break
#######################################

connection.close()
corpus_file.close()
#!/usr/bin/env python3
from konlpy.tag import Hannanum, Kkma, Komoran, Mecab, Okt

hannanum = Hannanum()
print('[Hannanum]')
print(hannanum.analyze('롯데마트의 흑마늘 양념 치킨이 논란이 되고 있다.'))

kkma = Kkma()
print('[Kkma]')
print(kkma.morphs('공부를 하면할수록 모르는게 많다는 것을 알게 됩니다.'))

komoran = Komoran()
print('[Komoran]')
print(komoran.morphs(u'우왕 코모란도 오픈소스가 되었어요'))

mecab = Mecab()
print('[Mecab]')
print(mecab.morphs(u'영등포구청역에 있는 맛집 좀 알려주세요.'))

okt = Okt()
print('[Okt]')
print(okt.morphs(u'단독입찰보다 복수입찰의 경우'))
ls
cd Mecab-ko-for-Google-Colab/
ls
!bash install_mecab-ko_on_colab190912.sh
!pip install konlpy

from konlpy.tag import Mecab

mecab = Mecab()
print(mecab.morphs('파일을로컬에받아놓아도다운이되는건가?'))

# Commented out IPython magic to ensure Python compatibility.
import os
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
# %matplotlib inline

# from google.colab import files
# uploaded = files.upload()
# print(uploaded.keys())
# for fn in uploaded.keys():
#     print('User uploaded file "{name}" with length {length} bytes'.format(
import pandas as pd
import numpy as np
import tensorflow as tf
import autokeras as ak
from konlpy.tag import Mecab

train = pd.read_csv("1.csv")
test = pd.read_csv("2.csv")
submission = pd.read_csv("sample_submission.csv")

mecab = Mecab()
train['content'] = train['content'].map(lambda x: ' '.join(mecab.morphs(x)))
test['content'] = test['content'].map(lambda x: ' '.join(mecab.morphs(x)))

x_train = train['content'].values
y_train = train['info'].values

input_node = ak.TextInput()
output_node = ak.TextBlock()(input_node)
output_node = ak.ClassificationHead()(output_node)
clf = ak.AutoModel(inputs=input_node, outputs=output_node, overwrite=True, max_trials=20)
clf.fit(x_train, y_train, epochs=5)

model = clf.export_model()
tf.keras.backend.clear_session()
model = tf.keras.models.load_model('./auto_model/best_model')
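# A sketch of inference with the reloaded model, assuming the exported model
# accepts the same string array format as training and that the submission
# file uses an 'info' column mirroring the training labels; the rounding to a
# binary label is also an assumption.
x_test = test['content'].values
pred = model.predict(x_test)
submission['info'] = pred.flatten().round().astype(int)
submission.to_csv('submission.csv', index=False)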
from konlpy.tag import Mecab
from gensim.models import KeyedVectors

# Data cleansing
mecab = Mecab()
articlesMorphs = open("articlesMorphs.txt", "w")

for news_name in [
        'chosun_full', 'donga_full', 'hani_full', 'joongang_full', 'kh_full'
]:
    with open('./Data/news/' + news_name + '.txt', 'r') as f:
        lines = f.read()
        sentences = lines.split('.')
        for i in range(len(sentences)):
            dic = mecab.morphs(sentences[i])
            dicSize = len(dic)
            for idx in range(dicSize - 1):
                # dicSize shrinks as tokens are merged, so stop before
                # dic[idx + 1] runs past the end of the list
                if idx >= dicSize - 1:
                    break
                if dic[idx] == '새' and dic[idx + 1] == '정치':
                    # merge the split party name back into a single token
                    dic[idx:idx + 2] = ["새정치"]
                    dicSize = dicSize - 1
            for word in dic:
                articlesMorphs.write("%s " % word)
def tokenize(sentence):
    tagger = Mecab()
    s = " ".join(tagger.morphs(sentence))
    logger.info("tokenized:" + s)
    return s
data_path = "info/train.txt" vocab_path = "info/mecab-vocab.json" save_path = "info/mecab-embedding" embedding_size = 100 vocab_size = 10000 seed = 100 np.random.seed(seed=seed) tokenizer = Mecab() data = read_text(data_path) data = make_texts(data) data = [preprocess_text(text) for text in data] tokenized_texts = [tokenizer.morphs(text) for text in tqdm(data)] model = Word2Vec(sentences=tokenized_texts, size=embedding_size, window=5, min_count=5, workers=4, sg=0) word2vec = model.wv vocab = read_json(vocab_path) vectors = [] for key, value in vocab.items(): try: vectors.append(word2vec[key])
# - To avoid situations like the one above, Korean is usually tokenized with a
#   morphological analyzer.
# - Here we will use mecab among the available morphological analyzers.
# - Install mecab on Colab with the commands below.
# - Unlike the earlier example, the particles '의', '를', '가', '랑', etc. are
#   all split off, so the machine can treat '사과' as a single word.
#
# ```
# git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
# cd Mecab-ko-for-Google-Colab
# chmod u+x install_mecab-ko_on_colab190912.sh
# ./install_mecab-ko_on_colab190912.sh
# ```

from konlpy.tag import Mecab

tokenizer = Mecab()
mu.log("tokenizer.morphs(kor_text)", tokenizer.morphs(kor_text))

################################################################################
# - Building the vocabulary
# - A vocabulary is the set of all unique words in a text, with duplicates removed.
# - First, we download the 'Naver movie review classification' data from GitHub
#   for this exercise.
# - The Naver movie review data consists of 200,000 movie reviews, each labeled
#   positive (1) or negative (0).
#
# ```
# pip3 install pandas
# ```

import urllib.request
import pandas as pd
from konlpy.tag import Mecab
from nltk import FreqDist
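# The imports above end where the tutorial's vocabulary-building step begins.
# A minimal sketch of that step with nltk's FreqDist; the two sample sentences
# stand in for the review texts and are illustrative only.
from itertools import chain

sample_docs = ['사과의 맛', '사과를 먹다']
tokenized = [tokenizer.morphs(doc) for doc in sample_docs]
vocab = FreqDist(chain.from_iterable(tokenized))  # token -> frequency
print(len(vocab), vocab.most_common(5))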
# inputs = inputs
# targets = targets
# preds = model(inputs)
# preds = preds.round()
# num_hit += torch.eq(preds.squeeze(), targets.squeeze()).sum().item()  # data[0]
# print(num_hit / len(test_data) * 100)

### test
# test_inputs = ["헐 진짜 개별로다..", "진짜 너무 재밌는 영화다 오랜만에", "오..이건 진짜 봐야함", "진짜 쓰레기 같은 영화", "노잼", "존잼", "꾸울잼", "핵노잼", '또 보고싶다', '꼬옥 봐야한다.. 진짜..', '나만 보기 아깝다', '돈이 아깝다', '나만 보기 억울하다', '나만 당할 수 없다', '너도 봐야한다', '혼자 본게 정말 후회된다. 이건 꼭 같이 봐야한다.', '재미없어요...', '꾸르르르르르르잼', '꾸르르르잼', '꾸르잼', '이 영화를 보고 암이 나았습니다.']
test_inputs = ['이 영화를 보고 암이 나았습니다.']

for test_input in test_inputs:
    tokenized = tagger.morphs(test_input)
    tokenized = pad_under_five(tokenized)
    input_ = TEXT.numericalize([tokenized], device=DEVICE)
    print(input_)

    if USE_CUDA:
        input_ = input_.cuda()

    prediction = model(input_)
    prediction = prediction.round()
    # "긍정" = positive, "부정" = negative
    prediction = "긍정" if prediction.data[0][0] == 1 else "부정"

    if prediction == "긍정":
        print(test_input, "\033[1;01;36m" + prediction + "\033[0m")
        # print(len(tokenized), tokenized)
    else:
        print(test_input, "\033[1;01;31m" + prediction + "\033[0m")
        # print(len(tokenized), tokenized)
def morphs_analysis(self, data):
    # Run Mecab over every document and flatten the results into one token list
    mecab = Mecab(self.path)
    ls = []
    for x in tqdm(range(len(data))):
        ls.append(mecab.morphs(data[x]))
    return list(chain.from_iterable(ls))
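# Usage sketch: the flattened token list from morphs_analysis lends itself to
# frequency counting. The owning class and its constructor are not shown
# above, so the call below is illustrative only.
from collections import Counter

# analyzer = SomeAnalyzer(dicpath)                          # hypothetical owner of morphs_analysis
# tokens = analyzer.morphs_analysis(['오늘 저녁 먹었어?'])  # hypothetical corpus
# print(Counter(tokens).most_common(10))                    # ten most frequent morphemes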