def tokenizer_test():
    """Check soynlp's three tokenizers against hard-coded expected outputs.

    Runs RegexTokenizer, LTokenizer (with and without ``tolerance``) and
    MaxScoreTokenizer on fixed Korean inputs and prints a confirmation
    line once every comparison has passed.

    Raises:
        ValueError: On the first tokenizer call whose result differs from
            the expected token list. The message embeds the result of
            calling the tokenizer again.
    """
    from soynlp.tokenizer import LTokenizer
    from soynlp.tokenizer import MaxScoreTokenizer
    from soynlp.tokenizer import RegexTokenizer

    def _verify(run, expected, template):
        # On mismatch the tokenizer is invoked a second time to build the
        # message, exactly as the inline checks used to do.
        if run() == expected:
            return
        raise ValueError(template.format(run()))

    regex_tokenizer = RegexTokenizer()
    _verify(
        lambda: regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!'),
        ['아라랄랄', '111', '이히힝', 'ㅇㅇ', 'ㅠㅠ', '우유우유', 'ab', '!'],
        "regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!') == {}",
    )

    ltokenizer = LTokenizer({'데이터':0.4, '데이':0.35, '데이터센터':0.38})
    _verify(
        lambda: ltokenizer.tokenize('데이터는 데이터센터의 데이데이'),
        ['데이터', '는', '데이터', '센터의', '데이', '데이'],
        "ltokenizer.tokenize('데이터는 데이터센터의 데이데이') == {}",
    )
    _verify(
        lambda: ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05),
        ['데이터', '는', '데이터센터', '의', '데이', '데이'],
        "ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05) == {}",
    )

    maxscore_tokenizer = MaxScoreTokenizer({'데이터':0.4, '데이':0.35, '데이터센터':0.38})
    _verify(
        lambda: maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이'),
        ['데이터', '는', '데이터', '센터의', '데이', '데이'],
        "maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이') == {}",
    )

    print('all tokenizer tests have been successed\n\n')
def clean_csv(dataset_file_dir, merged_file_save_path, ignore_list):
    """Merge every ``.csv`` file in a directory into one tokenized CSV.

    Each row of each input file is converted with ``str()``, tokenized with
    ``RegexTokenizer`` and filtered through ``is_valid_word``. The surviving
    tokens of each input row become one row of the output file.

    Args:
        dataset_file_dir: Directory scanned (non-recursively) for files
            whose name ends with ``.csv``.
        merged_file_save_path: Path of the CSV file to write (UTF-8).
        ignore_list: Passed unchanged to ``is_valid_word`` for every token.
    """
    sentence_list = []
    for filepath in os.listdir(dataset_file_dir):
        if not filepath.endswith(".csv"):
            continue
        entire_path = os.path.join(dataset_file_dir, filepath)
        # NOTE(review): inputs are read with the platform default encoding
        # while the output is written as UTF-8 -- confirm this is intended.
        with open(entire_path, newline="") as word_file:
            sentence_list.extend(csv.reader(word_file))

    # Tokenize everything before touching the output file so that a
    # tokenizer failure does not leave a truncated output behind.
    tokenizer = RegexTokenizer()
    tokenized_sentence_list = []
    for sentence in sentence_list:
        # `sentence` is a csv row (a list); str() hands the tokenizer the
        # list's textual representation.
        tokenized_sentence = tokenizer.tokenize(str(sentence))
        tokenized_sentence_list.append(
            [elem for elem in tokenized_sentence
             if is_valid_word(elem, ignore_list)]
        )

    # Context manager guarantees the handle is closed even if a write fails.
    with open(merged_file_save_path, 'w', encoding='utf-8',
              newline='') as merged_file:
        writer = csv.writer(merged_file)
        for clean_sentence in tokenized_sentence_list:
            writer.writerow(clean_sentence)
def convert_to_vector_list(self, ignore_list, model_length, sentence):
    """Turn a sentence into an array of exactly ``model_length`` word vectors.

    The sentence is tokenized with ``RegexTokenizer``, filtered through
    ``csv_reader.is_valid_word``, and each remaining token is looked up in
    the KeyedVectors stored at ``self.key_vector_path``. The resulting
    vectors are repeated cyclically and truncated to ``model_length`` rows.

    Args:
        ignore_list: Passed unchanged to ``csv_reader.is_valid_word``.
        model_length: Number of vectors the returned array must contain.
        sentence: Input text; converted with ``str()`` before tokenizing.

    Returns:
        ``np.array`` built from ``model_length`` vectors (empty when
        ``model_length`` is not positive).
    """
    # Placeholder for tokens missing from the vocabulary.
    # NOTE(review): presumably matches a 100-dimensional embedding -- confirm.
    unknown_vector = [1] * 100

    tokenizer = RegexTokenizer()
    tokenized_sentence = tokenizer.tokenize(str(sentence))
    print(self.key_vector_path)
    kv = KeyedVectors.load(self.key_vector_path, mmap='r')
    clean_sentence = [
        elem for elem in tokenized_sentence
        if csv_reader.is_valid_word(elem, ignore_list)
    ]

    vector = []
    for elem in clean_sentence:
        try:
            array = kv[elem]
        except KeyError:
            # Out-of-vocabulary token.
            array = unknown_vector
        vector.append(array)

    if not vector:
        # Nothing survived filtering. Without this guard the padding loop
        # below would spin forever (extending by an empty list never
        # reaches model_length), so pad with the unknown-word vector.
        vector = [unknown_vector]

    vector_list = []
    while len(vector_list) < model_length:
        vector_list += vector
    if len(vector_list) > model_length:
        vector_list = vector_list[:model_length]
    return np.array(vector_list)
class RegexTokenizerKorean(SpecialTokenizer):
    """Callable adapter around soynlp's ``RegexTokenizer``."""

    def __init__(self):
        # Imported lazily so soynlp is only needed once an instance is built.
        from soynlp.tokenizer import RegexTokenizer

        self.inst = RegexTokenizer()
        self.OUT_TYPE = [list, str]

    def __call__(self, *args, **kwargs):
        """Tokenize the first positional argument; everything else is ignored."""
        text = args[0]
        return self.inst.tokenize(text)
def tokenizer_test():
    """Run fixed-input regression checks on soynlp's tokenizers.

    Covers RegexTokenizer, LTokenizer (default and ``tolerance=0.05``) and
    MaxScoreTokenizer, then prints a confirmation line.

    Raises:
        ValueError: When a tokenizer's output differs from the expected
            token list; the message contains the output of a second call.
    """
    from soynlp.tokenizer import LTokenizer
    from soynlp.tokenizer import MaxScoreTokenizer
    from soynlp.tokenizer import RegexTokenizer

    def _check(tokenize, text, expected, template, **options):
        # Mirrors the inline form: compare once, and on failure tokenize
        # again to fill the error message.
        if tokenize(text, **options) == expected:
            return
        raise ValueError(template.format(tokenize(text, **options)))

    mixed_text = '아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!'
    data_text = '데이터는 데이터센터의 데이데이'

    regex_tokenizer = RegexTokenizer()
    _check(
        regex_tokenizer.tokenize,
        mixed_text,
        ['아라랄랄', '111', '이히힝', 'ㅇㅇ', 'ㅠㅠ', '우유우유', 'ab', '!'],
        "regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!') == {}",
    )

    ltokenizer = LTokenizer({'데이터': 0.4, '데이': 0.35, '데이터센터': 0.38})
    _check(
        ltokenizer.tokenize,
        data_text,
        ['데이터', '는', '데이터', '센터의', '데이', '데이'],
        "ltokenizer.tokenize('데이터는 데이터센터의 데이데이') == {}",
    )
    _check(
        ltokenizer.tokenize,
        data_text,
        ['데이터', '는', '데이터센터', '의', '데이', '데이'],
        "ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05) == {}",
        tolerance=0.05,
    )

    maxscore_tokenizer = MaxScoreTokenizer({
        '데이터': 0.4,
        '데이': 0.35,
        '데이터센터': 0.38
    })
    _check(
        maxscore_tokenizer.tokenize,
        data_text,
        ['데이터', '는', '데이터', '센터의', '데이', '데이'],
        "maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이') == {}",
    )

    print('all tokenizer tests have been successed\n')
def prepare_corpus(self, ignore_list, model_length, corpus_path):
    """Build training inputs and one-hot labels from a two-column CSV corpus.

    Column 0 of each row is the sentence and column 1 the label. A label of
    ``'1'`` becomes ``[1, 0]``; any other value becomes ``[0, 1]``. The
    sentence is tokenized, filtered via ``csv_reader.is_valid_word``, mapped
    to vectors with the word2vec wrapper, and the vectors are repeated and
    truncated to ``model_length`` entries. Rows that yield no vectors are
    dropped.

    Args:
        ignore_list: Passed unchanged to ``csv_reader.is_valid_word``.
        model_length: Number of vectors kept per sentence.
        corpus_path: Path of the CSV corpus file.

    Returns:
        Tuple ``(train_input, train_label)`` of two parallel lists of
        ``np.array`` objects.
    """
    tokenizer = RegexTokenizer()
    train_input = []
    train_label = []

    myw2v = w2v.word2vec(self.model_path)
    myw2v.load_keyvector(self.key_vector_path)

    with open(corpus_path, newline='') as corpus_file:
        for row in csv.reader(corpus_file):
            sentence, raw_label = row[0], row[1]
            # '1' marks uncensored data; everything else is censored.
            one_hot = [1, 0] if raw_label == '1' else [0, 1]

            tokens = tokenizer.tokenize(str(sentence))
            kept_tokens = [
                token for token in tokens
                if csv_reader.is_valid_word(token, ignore_list)
            ]
            vector = [myw2v.get_vector(token) for token in kept_tokens]
            print("length: " + str(len(vector)))

            if not len(vector) > 0:
                # No usable token in this row: skip it.
                continue

            # Cycle the sentence's vectors until the target length is
            # reached, then cut off any overshoot.
            padded = []
            while len(padded) < model_length:
                padded.extend(vector)
            if len(padded) > model_length:
                padded = padded[:model_length]

            train_input.append(np.array(padded))
            train_label.append(np.array(one_hot))

    return (train_input, train_label)
def word2vec(user_file='./review_01_0005_72378155.txt'):
    """Train a skip-gram Word2Vec model on a text file and query it interactively.

    Reads lines from ``user_file`` (skipping lines that contain the
    ``-----------------`` separator) until the file ends or more than 5000
    sentences have been collected, tokenizes them with ``RegexTokenizer``,
    trains the model, and then loops reading words from standard input and
    printing their five most similar words. An empty input line ends the loop.

    Args:
        user_file: Path of the UTF-8 text file, one sentence per line.
    """
    tokenizer = RegexTokenizer()

    sents = []
    # Context manager closes the file; iterating stops at EOF instead of
    # appending empty strings until the 5000-sentence cap is reached.
    with open(user_file, 'r', encoding='UTF-8', newline='') as source:
        for line in source:
            # Raw string: '\s' is not a valid escape in a plain literal.
            line = re.sub(r'\s*\n', '', line)
            if "-----------------" not in line:
                sents.append(line)
            if len(sents) > 5000:
                break

    tokenized_contents = [
        tokenizer.tokenize(sent, flatten=True) for sent in sents
    ]

    embedding_model = Word2Vec(tokenized_contents, size=100, window=5,
                               min_count=2, workers=4, iter=100, sg=1)

    while True:
        print("User input : ")
        user_input = input()
        # Compare by value; `is ""` tests object identity and only works
        # by accident of string interning.
        if user_input == "":
            break
        try:
            result = embedding_model.most_similar(positive=[user_input],
                                                  topn=5)
            for elem in result:
                print(elem)
        except Exception:
            # Deliberate best effort: any lookup failure is reported to
            # the user and the prompt loop continues.
            print("ERROR : 결과가 없습니다.")
# Side-by-side demo: print the output of KoNLPy's Mecab `morphs` and of
# soynlp's RegexTokenizer `tokenize` on short Korean sample sentences.
from soynlp.tokenizer import RegexTokenizer
import konlpy

tok = konlpy.tag.Mecab()
tokenizer = RegexTokenizer()

# Note that the two calls are given different sample sentences.
print(tok.morphs('동일하게 테스트 중입니다'))
print(tokenizer.tokenize('테스트 중이다'))
def Tokenize(data):
    """Tokenize every item of ``data`` and re-join its tokens with spaces.

    Args:
        data: Iterable of strings.

    Returns:
        List with one space-joined token string per input item.
    """
    tokenizer = RegexTokenizer()
    return [' '.join(tokenizer.tokenize(text)) for text in data]
import sqlite3
import token_word_judge
import os
import sys
import json

# Command-line arguments: argv[1] is the user name, argv[2] the word(s)
# the user entered.
user_name = sys.argv[1]
# Word entered by the user.
user_input_word = sys.argv[2]
# input_number = input('Enter a number here: ')

# Create the RegexTokenizer object.
tokenizer = RegexTokenizer()
# Tokenize the entered word.
new_token_list = tokenizer.tokenize(user_input_word)
# Pass the token list through token_word_judge.google_translator
# (original note: "the token list in an English version too").
final_token_list = token_word_judge.google_translator(new_token_list)
# Original author's timing note: about 0.1 s when only this is done.

# print(new_token_list)
# final_token_list = []
# # Code that extracts nouns only
# for new_token in new_token_list:
# # Anything that is a str is either English or a single Korean noun.
# if type(token_word_judge.token_judge_en_lower_ko_noun(new_token)) == str:
# final_token_list.append(token_word_judge.token_judge_en_lower_ko_noun(new_token))
# # Anything that is a list holds several nouns bundled together.
# In[40]: df = pd.DataFrame(review_list, columns=['review']) # In[41]: from soynlp.tokenizer import RegexTokenizer, LTokenizer, MaxScoreTokenizer tokenizer = RegexTokenizer() tokenizer # In[42]: parsed_list = [] for i in df['review']: temp = tokenizer.tokenize(i) parsed_list.append(temp) df['review_parsed'] = parsed_list #print(df) # In[43]: STOP_WORDS = ['.', '(', ')', '!', '[', ']', '▣', '※'] # In[44]: def remove_stopwords(tokens): return [t for t in tokens if t not in STOP_WORDS]
def db_sentence_2_token_list(database_history_all_users_data_list):
    """Explode history rows into one output row per token of the title.

    Each title is split with ``RegexTokenizer``. Every token is classified
    by ``kor_or_eng_judge``:

    * ``'en'`` -- the value returned by ``token_judge_en_lower_ko_noun`` is
      emitted as a single token.
    * ``'ko'`` -- every element of the value returned by
      ``token_judge_en_lower_ko_noun`` is emitted as its own token (nothing
      is emitted when it is empty).
    * anything else (e.g. ``0``) -- the token is dropped.

    Args:
        database_history_all_users_data_list: Iterable of 4-item rows that
            unpack as ``(url, title, visit_count, user_count)``.

    Returns:
        List of ``[token, url, visit_count, user_count]`` lists, e.g.
        ``[['computer', 'https://www.youtube.com/', 36, 3], ...]``.
    """
    tokenizer = RegexTokenizer()
    result = []

    # Read the database rows one at a time.
    for line in database_history_all_users_data_list:
        url, title, visit_count, user_count = line

        # Split the title text into words; an empty title yields no
        # tokens and therefore contributes nothing.
        for word in tokenizer.tokenize(title):
            judgement = kor_or_eng_judge(word)

            if judgement == 'en':
                result.append([
                    token_judge_en_lower_ko_noun(word), url, visit_count,
                    user_count
                ])
            elif judgement == 'ko':
                # Call the helper once per word instead of once per
                # branch test. Iterating covers the zero-, one- and
                # many-noun cases identically.
                for token_noun in token_judge_en_lower_ko_noun(word):
                    result.append(
                        [token_noun, url, visit_count, user_count])
            # Any other judgement: neither English nor Korean, skip.

    return result
from soynlp.tokenizer import RegexTokenizer from soynlp.noun import LRNounExtractor # import pandas as pd # import numpy as np from wordcloud import WordCloud import matplotlib.pyplot as plt import matplotlib.font_manager as fm import re content = '밥도둑 지코바 양념치킨~~ 한번 우동 사리를 추가 해 먹어봤어요 강추드려요!!!! 지코바는 마무리로 치밥인거 알죠?' tokenizer = RegexTokenizer() tokened_content = tokenizer.tokenize(content) print(tokened_content) # def preprocessing(text): # text = re.sub('\\\\n', ' ', text) # return text # sentences = care['content'].apply(preprocessing) # sentences = preprocessing(content) # tokens = tokenizer.tokenize(content) # print(tokens) # print(sentences) fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf' font = fm.FontProperties(fname=fontpath, size=9) stopwords_kr = [ '하지만', '그리고', '그런데', '저는', '제가', '그럼', '이런', '저런', '합니다', '많은', '많이', '정말', '너무'
#step3. 토큰화 참고 : https://linguistech.tistory.com/13 blog = pd.read_json('.blogreview.json') # print(df.count()) # print(df['title']) from soynlp.tokenizer import RegexTokenizer, LTokenizer, MaxScoreTokenizer tokenizer = RegexTokenizer() ##토큰화 테스트## sample_index = 5 sample_title = blog['title'][sample_index] sample_description = blog['description'][sample_index] # print(sample_title) # print(sample_description) tokened_title = tokenizer.tokenize(sample_title) tokened_description = tokenizer.tokenize(sample_description) # print(tokened_description) ##개행문자 제거## def preprocessing(text): text = re.sub('\\\\n', ' ', text) return text ##개행문자 제거## title_sentences = blog['title'].apply(preprocessing) description_sentences = blog['description'].apply(preprocessing) # print(title_sentences)
def review_cr(urll):
    """Scrape the reviews of a page and return its 20 most frequent tokens.

    Opens ``urll`` in Chrome through Selenium, scrolls to the bottom, waits
    for the page to settle, extracts the text of every
    ``div._czm8crp`` inside ``div#reviews``, tokenizes it with soynlp's
    ``RegexTokenizer``, drops punctuation stop words and counts the rest.

    Args:
        urll: URL of the page to scrape.

    Returns:
        List of ``(token, count)`` tuples as produced by
        ``Counter.most_common(20)``; empty when no review text is found.
    """
    import time
    from collections import Counter

    from bs4 import BeautifulSoup
    from selenium import webdriver
    from soynlp.tokenizer import RegexTokenizer

    STOP_WORDS = ['.', '(', ')', '!', '[', ']', '▣', '※']

    driver = webdriver.Chrome(
        'C:/Users/multicampus/PycharmProjects/airbnb_bot/chromedriver')
    try:
        driver.implicitly_wait(3)
        driver.get(urll)
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        # Fixed pause after scrolling before the page source is read.
        time.sleep(10)
        page_source = driver.page_source
    finally:
        # Always shut the browser down; previously every call left a
        # Chrome process running.
        driver.quit()

    soup = BeautifulSoup(page_source, 'html.parser')

    # The reviews container may be absent (layout change, page not
    # loaded); treat that as "no reviews" rather than crashing on None.
    container = soup.find('div', {'id': 'reviews'})
    if container is None:
        reviews = []
    else:
        reviews = container.findAll('div', {'class': '_czm8crp'})

    review_list = [review.string for review in reviews]
    print(review_list)

    tokenizer = RegexTokenizer()
    # `.string` is None for elements with more than one child node; such
    # entries cannot be tokenized and are skipped.
    faq_answer_parsed_lst = [
        token
        for review in review_list
        if review is not None
        for token in tokenizer.tokenize(review)
        if token not in STOP_WORDS
    ]

    counter = Counter(faq_answer_parsed_lst).most_common(20)
    print(counter)
    return counter