def search_book_by_title(self, title, n=10):
    """Search the inverted word index for books whose titles share words with *title*.

    The query title is POS-tagged with Kkma; nouns (tags starting with 'N')
    and foreign words (tag 'OL') that exist in ``self.wordtobookindex`` are
    looked up, and the book indices they map to are counted.

    Args:
        title: Query title string.
        n: Maximum number of results to return (default 10).

    Returns:
        A list of up to *n* ``(book_index, match_count)`` tuples, ordered by
        descending match count.
    """
    # De-duplicate (surface, tag) pairs so each distinct query word counts once.
    words = set(nlp_module.pos_Kkma(title))
    appearances = []
    for word in words:
        # Keep nouns ('N*') and foreign words ('OL') that the index knows about.
        if (word[1][0] == 'N' or word[1] == 'OL') and word[0] in self.wordtobookindex:
            appearances.extend(self.wordtobookindex[word[0]])
    # most_common(n) == sort by count descending, take top n.
    return Counter(appearances).most_common(n)
def init_word_index(self, book_set):
    """Build the inverted index mapping title words to book positions.

    Each book title in *book_set* is POS-tagged with Kkma; nouns (tags
    starting with 'N') and foreign words (tag 'OL') are recorded in
    ``self.wordtobookindex`` as ``word -> [book indices]``.

    Args:
        book_set: Sequence of book objects exposing a ``.title`` attribute;
            a book's position in the sequence is used as its index.

    Side effects:
        Replaces ``self.wordtobookindex`` and prints a debug dump of the
        finished index to stdout.
    """
    self.wordtobookindex = defaultdict(list)
    for idx, book in enumerate(book_set):
        # De-duplicate (surface, tag) pairs so a word repeated in one title
        # indexes that book only once.
        for word in set(nlp_module.pos_Kkma(book.title)):
            if word[1][0] == 'N' or word[1] == 'OL':
                self.wordtobookindex[word[0]].append(idx)
    # Debug dump: each indexed word with its (book index, title) pairs.
    for key, indices in self.wordtobookindex.items():
        print("'{}' : {}".format(key, [(i, book_set[i].title) for i in indices]))
    print("index init finished")
def tokenize(sentences):
    """Extract a set of lowercase common/proper nouns from Korean text.

    The input is POS-tagged with Kkma; only tokens tagged 'NNG' (common
    noun) or 'NNP' (proper noun) are kept, lowercased, and filtered
    against a stop-word set of noise fragments and boilerplate terms.

    Args:
        sentences: Text string to tokenize.

    Returns:
        A set of lowercase noun strings, stop words excluded.
    """
    # frozenset gives O(1) membership tests (the original list was O(n) per
    # token and contained duplicates).
    ignore_words = frozenset([
        'extreme', 'novel', 'j', 'l', 'nt', '노', '라이트', '시리즈', '제',
        '일본', '의', '벨', '회', '노와', '리', '장', '마', '라', '히', '을',
        '토', '팬', '전', '탄', '오', '말', '메', '레', '로', '타', '바', '가',
        '서', '루', '디', '데', '요', '하', '들', '카', '키', '미', '코', '유',
        '속', '우', '사', '르', '출간', '사이트', '독자', '세', '작품', '편',
        '완결', '신작', '호평', '누', '연재', '저자', '원작', '주', '단편',
        '단편집', '국내', '장려상', '발행', '최신작', '모', '사토', '와의',
        '중', '책갈피', '초판', '한국', '투', '황', '모토', '니스', '마코토',
        '나미', '통합', '루가', '이지만', '마사', '맥', '토모', '이슈', '전개',
    ])
    tokens = set()
    for surface, tag in nlp_module.pos_Kkma(sentences):
        if tag in ('NNG', 'NNP'):
            word = surface.lower()
            if word not in ignore_words:
                tokens.add(word)
    return tokens