def wordRank():
    # Retrieve text from Elasticsearch
    results = es.get(index='nkdb', doc_type='nkdb', id='5dc9fc5033ec463330e97e94')
    texts = json.dumps(results['_source'], ensure_ascii=False)

    # Split the text into sentences
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', texts)

    # Normalize the text
    texts = [normalize(text, number=True) for text in sentences]

    wordrank_extractor = KRWordRank(
        min_count=3,    # minimum frequency of a word
        max_length=10,  # maximum length of a word
        verbose=True)

    beta = 0.85   # decaying factor beta of PageRank
    max_iter = 10
    keywords, rank, graph = wordrank_extractor.extract(texts, beta, max_iter)

    # Build a list of {label: word, y: weight} dictionaries for the top 30 keywords
    result = []
    for word, r in sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:30]:
        result.append({"label": word, "y": r})
    return json.dumps(result, ensure_ascii=False)
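# A minimal usage sketch for wordRank() above (hedged): it assumes `es` is an already
# connected Elasticsearch client and that the document id exists; it only parses the
# returned JSON string back into the [{'label': word, 'y': weight}, ...] shape.
import json

payload = json.loads(wordRank())
for item in payload[:5]:
    print(item["label"], round(item["y"], 3))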
def normalizeCF(input_fname, output_fname):
    texts = get_texts(input_fname)
    with open(output_fname, 'w', encoding='utf-8') as f:
        for text in texts:
            text = normalize(text, english=True, number=True)
            text0 = text

            # Noun extraction, pass 1
            noun_text = expect_noun_text(text)
            text1 = ' '.join(noun_text)
            text = text0 + text1

            # # Noun extraction, pass 2
            # noun_text = extract_noun.findKoNoun(text)
            # noun_text_list = noun_text[0] + noun_text[1]
            # text2 = ' '.join(noun_text_list)
            # text = text0 + ' ' + text1 + ' ' + text2

            if text.strip() == '':
                continue
            print('*' * 10, text)
            f.write('%s\n' % text)
    return texts
def preprocessing_text(texts):
    # On the first run, install all of the packages below.
    # !pip install git+https://github.com/ssut/py-hanspell.git
    # !pip install konlpy
    # !pip install krwordrank
    # If an error occurs, see: https://data-scientist-brian-kim.tistory.com/79
    print("preprocessing_text")
    from hanspell import spell_checker
    from tqdm.notebook import tqdm
    from konlpy.tag import Twitter
    from collections import Counter
    from krwordrank.hangle import normalize

    nlpy = Twitter()
    # lines = [line.rstrip('\n') for line in texts]
    lines = texts.splitlines()

    nouns_word = []  # extracted noun tokens
    normalized_lines = []
    for each_line in tqdm(lines):
        each_line = each_line.replace("\x0c", "")  # remove the form-feed characters introduced while loading the JSON
        each_line = normalize(each_line, english=True, number=True)  # strip special characters
        each_line = spell_checker.check(each_line).checked  # fix spelling mistakes, if any
        nouns_word = nouns_word + nlpy.nouns(each_line)  # extract noun tokens
        normalized_lines.append(each_line)
    return lines, nouns_word, normalized_lines
def load_comments(movie_id_list):
    # Collect the comments of every movie, fetched from the database per movie id.
    comments_list = []
    for idx in movie_id_list[:]:
        # db = pymysql.connect(host='localhost', port=3306, db='project_db', user='******',
        #                      passwd='python', charset='utf8')
        db = pymysql.connect(host='localhost', port=3306, db='recommend_db', user='******',
                             passwd='1234', charset='utf8')
        query = 'select comment from movieapp_comment where movie_id={}'.format(idx)
        try:
            # select, update
            with db.cursor() as cursor:
                cursor.execute(query)
                result_list = cursor.fetchall()  # fetch the query results from the cursor
        finally:
            db.close()

        texts = [row[0] for row in result_list]
        # Keep only English, Hangul, and digits
        texts = [normalize(text, english=True, number=True) for text in texts]
        comments_list.append(texts)

    print('comments_list 개수: ', len(comments_list))
    return comments_list
def keyword_normalize(lyricData):
    lyrics = []
    for data in lyricData:
        texts = []
        lyric = data[1].split('\n')
        for l in lyric:
            if not bool(l.strip()):
                continue
            texts.append(normalize(l))
        # Drop empty lines and the '중국어 병음' (Chinese pinyin) marker lines
        texts = list(filter(lambda v: v and v != '중국어 병음', texts))
        lyrics.append({'album_id': data[0], 'lyric': texts})
    return lyrics
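# A small sketch of the input shape keyword_normalize() appears to expect, assuming
# `lyricData` is a list of (album_id, raw_lyric_text) pairs as the indexing suggests;
# blank lines and the '중국어 병음' marker lines are dropped. The sample data is hypothetical.
sample = [
    (1, '첫 번째 줄\n\n중국어 병음\n두 번째 줄'),
]
for entry in keyword_normalize(sample):
    print(entry['album_id'], entry['lyric'])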
def get_keywords(title, text):
    """
    :param title: title of article
    :param text: body of article
    :return: key_words
    """
    texts = [text]
    texts = [normalize(t, english=True, number=True) for t in texts]

    wordrank_extractor = KRWordRank(
        min_count=2,    # minimum word frequency (when building the graph)
        max_length=10,  # maximum word length
        verbose=True)

    beta = 0.85   # decaying factor beta of PageRank
    max_iter = 10
    keywords, rank, graph = wordrank_extractor.extract(texts, beta, max_iter)

    # Classify using the ranks
    tagger = Komoran()
    stopword = set([('있', 'VV'), ('하', 'VV'), ('되', 'VV')])
    keyword_list = []
    for i in keywords:
        noun = tagger.nouns(i)
        if noun != []:
            keyword_list.append([noun[0], keywords[i]])

    keywords = []
    for i in keyword_list[:5]:
        keywords.append(i[0])

    title_keywords = []
    for j in keywords:
        if j in title:
            title_keywords.append(j)

    # Remove stopwords without mutating the list while iterating over it
    # (stopword_list is assumed to be defined at module level).
    title_keywords = [w for w in title_keywords if w not in stopword_list]
    return title_keywords
def test_normalize():
    input_str = '한글과 alphabet 으로 이뤄진 20글자에 가까운.. 문장이에요'
    form = '\npassed case: {}\ninput : {}\noutput: {}'

    settings = [
        ('Hangle', False, False, False,
         '한글과 으로 이뤄진 글자에 가까운 문장이에요'),
        ('Hangle + English', True, False, False,
         '한글과 alphabet 으로 이뤄진 글자에 가까운 문장이에요'),
        ('Hangle + English + Number', True, True, False,
         '한글과 alphabet 으로 이뤄진 20글자에 가까운 문장이에요'),
        ('Hangle + English + Number + Punctuation', True, True, True,
         '한글과 alphabet 으로 이뤄진 20글자에 가까운.. 문장이에요'),
    ]

    for name, english, number, punctuation, expected in settings:
        pattern = initialize_pattern(english, number, punctuation, remains=None)
        output_str = normalize(input_str, pattern=pattern)
        assert output_str == expected
        message = form.format(name, input_str, output_str)
        print(message)
def keyword_extraction(txt):
    # Split the raw text into whitespace-separated tokens
    list_str = txt.split()
    list_file = []
    for line in list_str:
        list_file.append(line)
    texts = list_file
    texts = [normalize(text, english=True, number=True) for text in texts]

    wordrank_extractor = KRWordRank(
        min_count=5,    # minimum word frequency (when building the graph)
        max_length=10,  # maximum word length
        verbose=True)

    beta = 0.85   # decaying factor beta of PageRank
    max_iter = 10
    keywords, rank, graph = wordrank_extractor.extract(texts, beta, max_iter)

    # Append the five highest-ranked words
    # (result_list is assumed to be defined at module level).
    for word, r in sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:5]:
        result_list.append(word)
def _construct_word_graph(self, docs):
    def normalize(graph):
        # Divide each edge weight by the sum of the source node's outbound weights
        graph_ = defaultdict(lambda: defaultdict(lambda: 0))
        for from_, to_dict in graph.items():
            sum_ = sum(to_dict.values())
            for to_, w in to_dict.items():
                graph_[to_][from_] = w / sum_
        graph_ = {t: dict(fd) for t, fd in graph_.items()}
        return graph_

    graph = defaultdict(lambda: defaultdict(lambda: 0))
    for doc in docs:
        tokens = doc.split()
        if not tokens:
            continue
        links = []
        for token in tokens:
            links += self._intra_link(token)
        if len(tokens) > 1:
            tokens = [tokens[-1]] + tokens + [tokens[0]]
            links += self._inter_link(tokens)
        links = self._check_token(links)
        if not links:
            continue
        links = self._encode_token(links)
        for l_node, r_node in links:
            graph[l_node][r_node] += 1
            graph[r_node][l_node] += 1
    # Reversed to form the inbound graph, but normalized by the sum of outbound weights
    graph = normalize(graph)
    return graph
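# A stand-alone sketch (toy data, not the library's graph) of what the inner normalize()
# above does: each raw co-occurrence count is divided by the source node's total outbound
# weight, and the result is stored under the destination node, i.e. as an inbound view.
from collections import defaultdict

counts = {'a': {'b': 3, 'c': 1}, 'b': {'a': 2}}  # source -> {destination: count}

inbound = defaultdict(dict)
for from_, to_dict in counts.items():
    total = sum(to_dict.values())
    for to_, w in to_dict.items():
        inbound[to_][from_] = w / total  # outbound-normalized weight, keyed by destination

print(dict(inbound))  # {'b': {'a': 0.75}, 'c': {'a': 0.25}, 'a': {'b': 1.0}}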
df

# In[2]:

df['TEXT']

# In[3]:

texts = df['TEXT'].values.tolist()
print(texts[0])

# In[4]:

from krwordrank.hangle import normalize

texts = [normalize(str(text), english=True, number=True) for text in texts]

# In[5]:

from krwordrank.word import KRWordRank

wordrank_extractor = KRWordRank(
    min_count=5,    # minimum word frequency (when building the graph)
    max_length=10,  # maximum word length
    verbose=True)

beta = 0.85   # decaying factor beta of PageRank
max_iter = 10
keywords, rank, graph = wordrank_extractor.extract(texts, beta, max_iter)
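# A possible follow-up cell (the cell number is illustrative), mirroring the sorting idiom
# used in the other snippets in this file, to print the highest-ranked words.

# In[6]:

for word, r in sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:10]:
    print('%8s: %.4f' % (word, r))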
def normalize(self, text):
    text = self.remove_keyboard_out_chractor(text)
    text = normalize(text, english=True, number=True, punctuation=True)
    return text
def normalizeText(text):
    text = normalize(text, english=True, number=True, punctuation=True)
    return text
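# A quick, hedged sketch of calling the wrapper above; the sample sentence is borrowed
# from test_normalize() earlier in this file, and the output is printed rather than asserted.
sample = '한글과 alphabet 으로 이뤄진 20글자에 가까운.. 문장이에요'
print(normalizeText(sample))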
def __get_splited_sentence(whole_text, sentence_tokenizer="re"):
    """
    :param whole_text: original text of the article
    :param sentence_tokenizer: which tokenizer to use
        re    : split on the three characters . ? !
        enter : split on newlines
        jum   : split on a single .
        kss   : use the kss tokenizer
        If special symbols are present, split on them first and then apply kss.
    :return:
    """
    # print(whole_text)

    # If the input is already a list
    if type(whole_text) == list:
        splited = [
            normalize(sentence, english=True, number=True)
            for sentence in whole_text
        ]
        splited_num = len(splited)
        return splited, splited_num

    # If the string contains special symbols
    special_character = ['○', '□', '▣', '※', '①', '②', '③', '◇', '●', '★', '-']
    special_counter = 0
    for special in special_character:
        special_counter += whole_text.count(special)
    if special_counter > 0:
        special_splited = re.split("[○●□▣※◇①②③★-]", whole_text.replace("\n", "")[:-1])
        splited = []
        for sentence in special_splited:
            sentence = kss.split_sentences(sentence)
            if type(sentence) == list:
                for sentence_splited in sentence:
                    splited.append(sentence_splited)
            else:
                splited.append(sentence)
        sentence_tokenizer = "special_character"
        # print("TEST ", splited)

    if sentence_tokenizer == "re":
        splited = re.split("[.!?]", whole_text.replace("\n", "")[:-1])
    elif sentence_tokenizer == "kss":
        splited = kss.split_sentences(whole_text.replace("\n", ""))
    elif sentence_tokenizer == "enter":
        # Still a work in progress; this could probably be handled in the normalize step
        splited = whole_text.split("\n")
    elif sentence_tokenizer == "jum":
        splited = whole_text.split(".")
    elif sentence_tokenizer == "kss + re":
        kss_splited = kss.split_sentences(whole_text.replace("\n", ""))
        splited = []
        for sentence in kss_splited:
            print("kss1: ", sentence)
            re_splited_sentences = re.split("[.!?]", sentence.replace("\n", "")[:-1])
            if type(re_splited_sentences) == list and len(re_splited_sentences) > 1:
                print("if: ", re_splited_sentences)
                for re_splited_sentence in re_splited_sentences:
                    splited.append(re_splited_sentence)
            else:
                print("else: ", sentence)
                splited.append(sentence)
        print("rekss : ", splited)

    splited = __get_limited_length_sentence(splited, 5, 600)
    keyword_splited = [
        normalize(sentence, english=True, number=False)
        for sentence in splited
    ]
    print(len(splited), splited)
    splited_num = len(splited)
    return splited, keyword_splited, splited_num
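# A hedged call sketch for __get_splited_sentence(), assuming it sits at module level
# (the leading double underscore is only a naming convention there) and that re, kss and
# __get_limited_length_sentence are available in the same module. The article is made up.
article = "첫 번째 문장입니다. 두 번째 문장입니다! 세 번째 문장인가요?"
splited, keyword_splited, n = __get_splited_sentence(article, sentence_tokenizer="re")
print(n, keyword_splited)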
from hanspell import spell_checker
from tqdm.notebook import tqdm
from konlpy.tag import Twitter
from collections import Counter
from krwordrank.hangle import normalize

nlpy = Twitter()
lines = [line.rstrip('\n') for line in texts]  # split the txt file on newline characters

nouns_word = []  # extracted noun tokens
normalized_lines = []
for each_line in tqdm(lines):
    each_line = each_line.replace("\x0c", "")  # remove the form-feed characters introduced while loading the JSON
    each_line = normalize(each_line, english=True, number=True)  # strip special characters
    each_line = spell_checker.check(each_line).checked  # fix spelling mistakes, if any
    nouns_word = nouns_word + nlpy.nouns(each_line)  # extract noun tokens
    normalized_lines.append(each_line)

# In[8]:

normalized_lines

# In[9]:

# Count noun frequencies
from collections import Counter

count = Counter(nouns_word)
tag_count = []
□ 자동차 압류내역 조회 인터넷 사이트(열람 무료)
 ⊙ 정부24 [ https://www.gov.kr/portal/main ]
  - 자동차 등록원부등본(초본) 발급·열람신청
□ 차량번호로 전국의 주정차위반 과태료 조회 인터넷 사이트
 ⊙ 위택스( www.wetax.go.kr/ ) ☞ 납부하기 ☞ 지방세외수입 ☞ 차량번호 조회
□ 경찰서의 속도 및 신호위반 과태료(범칙금) 조회 및 납부 인터넷사이트
 ⊙ 경찰청교통민원24 [ 이파인 https://www.efine.go.kr/ ☎ 182 ]"""

okt_test = Okt()

splited = gisa.split("\n\n")
splited = [normalize(text, english=True, number=True) for text in splited]
# splited = kss.split_sentences(gisa.replace("\n", ""))
# splited = gisa.replace("\n", "").split(".")[:-1]
# splited = re.split("[.!?] ", gisa.replace("\n", "")[:-1])
split_num = len(splited)
print(split_num)

texts = [normalize(text, english=True, number=True) for text in splited]
print(texts, len(texts))

wordrank_extractor = KRWordRank(
    min_count=5,    # minimum word frequency (when building the graph)
    max_length=10,  # maximum word length
    verbose=True