class KoreaHelper(object):
    def __init__(self):
        from konlpy.tag import Mecab
        self.mecab = Mecab()

    def pos(self, phrase: Text):
        """
        $ python -m sagas.ko.ko_helper pos '계획이'

        :param phrase:
        :return:
        """
        return self.mecab.pos(phrase)

    def nouns(self, phrase: Text):
        """
        $ python -m sagas.ko.ko_helper nouns '피자와 스파게티가'
        $ python -m sagas.ko.ko_helper nouns '계획이'

        :param phrase:
        :return:
        """
        from sagas.nlu.transliterations import translits
        from sagas.ko.kwn_procs import kwn

        ns = self.mecab.nouns(phrase)
        rs = []
        for w in ns:
            # ws = get_word_sets(w, 'ko')
            ws = kwn.get_synsets(w, first=True)
            if ws:
                rs.append({'spec': ws[0].name(),
                           'text': w,
                           'translit': translits.translit(w, 'ko'),
                           'definition': ws[0].definition()})
            else:
                rs.append({'text': w,
                           'translit': translits.translit(w, 'ko')})
        return rs

    def translit(self, word):
        """
        $ python -m sagas.ko.ko_helper translit '피자와 스파게티가'

        See also: procs-ko-konlpy.ipynb

        :param word:
        :return:
        """
        from sagas.nlu.transliterations import translits

        for w, p in self.mecab.pos(word):
            expl = '_'
            if p in ('NNG', 'VV'):
                ws = get_word_sets(w, 'ko')
                if ws:
                    expl = f"{ws['name']}({ws['definition']})"
            print(w, translits.translit(w, 'ko'), p, expl)
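# Hedged usage sketch for KoreaHelper above (assumes the sagas package and a local
# mecab-ko dictionary are installed; the outputs shown are illustrative, not verified):
#
#   helper = KoreaHelper()
#   helper.pos('계획이')                 # e.g. [('계획', 'NNG'), ('이', 'JKS')]
#   helper.nouns('피자와 스파게티가')      # list of dicts with 'text', 'translit' and,
#                                        # when a synset is found, 'spec'/'definition'
#   helper.translit('피자와 스파게티가')   # prints morph, romanization, tag and gloss per token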
def text_analysis():
    res = Response("block")
    res.headers["Access-Control-Allow-Origin"] = "*"
    jsonData = request.get_json()
    resultData = dict()
    tokenizer = Mecab()
    print(tokenizer.pos(jsonData['text']))
    resultData['result'] = tokenizer.pos(jsonData['text'])
    return json.dumps(resultData)
def load_data_and_labels2(file_name):
    positive_exams = []
    negative_exams = []
    positive_count = 0
    negative_count = 0
    exams = list(open(file_name, "r").readlines())
    for s in exams:
        splited = s.split('\t')
        if splited[2] == '0\n':
            negative_exams.append(splited[1])
            negative_count = negative_count + 1
        elif splited[2] == '1\n':
            positive_exams.append(splited[1])
            positive_count = positive_count + 1
        else:
            print(splited[0], splited[1], splited[2])

    mecab = Mecab()

    positive_result = []
    for pp in positive_exams:
        one_str = mecab.pos(pp)
        str_result = ''
        for p in one_str:
            if p[1] in {'NNG', 'NNP', 'NNB', 'NNBC', 'VA', 'VV', 'SL', 'SN', 'SY'}:
                str_result = p[0] + ' ' + str_result
        positive_result.append(str_result)
    positive_labels = [[0, 1] for _ in positive_result]

    negative_result = []
    for pp in negative_exams:
        one_str = mecab.pos(pp)
        str_result = ''
        for p in one_str:
            if p[1] in {'NNG', 'NNP', 'NNB', 'NNBC', 'VA', 'VV', 'SL', 'SN', 'SY'}:
                str_result = p[0] + ' ' + str_result
        negative_result.append(str_result)
    negative_labels = [[1, 0] for _ in negative_result]

    y = np.concatenate([positive_labels, negative_labels], 0)
    x_text = positive_result + negative_result
    return [x_text, y]
def make_question_mecab_tokens(question):
    """
    Run morphological analysis on the input question:
    1. run Mecab
    2. drop particles (josa) and other excluded tags
    3. re-join the remaining tokens into a single sentence
    """
    # load mecab
    mecab = Mecab()

    # run mecab
    que_mecab = mecab.pos(question[0])

    # keep only these POS tags (drops particles, endings, etc.)
    morpheme = ['NNG', 'NNP', 'NNB', 'NNBC', 'NR', 'NP',
                'VV', 'VA', 'VX', 'VCP', 'VCN',
                'MM', 'MAG', 'MAJ', 'IC', 'SN']
    que_tokens = []
    for t in que_mecab:
        if t[1] in morpheme:
            que_tokens.append(t[0])
    if len(que_tokens) == 0:
        que_tokens.append('')

    # join back into a single sentence
    que_tokens_str = [' '.join(que_tokens)]
    return que_tokens_str
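# A minimal, hedged example of calling make_question_mecab_tokens (assumes konlpy with a
# mecab-ko dictionary; note the function expects the question wrapped in a list and
# returns a single-element list as well):
#
#   make_question_mecab_tokens(['오늘 서울 날씨 알려줘'])
#   # -> something like ['오늘 서울 날씨 알려 줘']: morphemes whose tag is outside the
#   #    allowed list (mostly particles and endings) are dropped before re-joining.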
def pre_process(self, json, istrain):
    mecab = Mecab()
    data = []
    for cnt, article in enumerate(json):
        if cnt % 10000 == 0:
            print(cnt)
        text = bs(article["text"], "html.parser").text
        # title_pos = ["%s_%s" % (word, pos) for word, pos in mecab.pos(article["title"])]
        # author_pos = ["%s_%s" % (word, pos) for word, pos in mecab.pos(article["author"])]
        text_pos = ["%s_%s" % (first, second) for first, second in mecab.pos(text)]
        data.append({
            # "title_pos": title_pos,
            # "title_pos_sentences": " ".join(title_pos),
            # "author_pos": author_pos,
            # "author_pos_sentences": " ".join(author_pos),
            "text": article["text"],
            "text_pos": text_pos,
            "text_pos_sentences": " ".join(text_pos),
            # "forumid": article["forumid"],
            "pk": article["pk"]
        })
        if istrain:
            data[cnt]["istroll"] = article["is_troll"]

    data = pd.DataFrame.from_dict(data)
    data = data.set_index('pk')
    return data
def voice2Text():
    fname = r'/home/ubuntu/handypotter/v2t.txt'
    with open(fname, mode='r', buffering=-1, encoding="UTF-8") as fp:
        text = fp.read()

    mecab = Mecab()
    lemmatizer = Lemmatizer(dictionary_name='default')

    # POS-tag the text and keep proper nouns, common nouns, verbs, adjectives, etc.
    tagged_list = mecab.pos(text)
    print(tagged_list)
    tags = ['NNP', 'NNG', 'NP', 'VV', 'VA', 'MAG', 'XR']
    stoptags = ['JKS', 'SF', 'XSN', 'EC', 'EP', 'VX', 'NNB', 'EF', 'JX',
                'EP+EF', 'XSV', 'XSA', 'XSN']
    sentence_token = [t[0] for t in tagged_list if t[1] in tags]
    print(sentence_token)
    return sentence_token
def analyzing_morphem(content_list):
    mecab = Mecab()
    for idx, doc in enumerate(content_list):
        if idx % 5000 == 0:
            print 'Morphem Analysis on %d' % idx
        yield ' '.join([part for part, pos in mecab.pos(doc.decode('utf-8'))]).encode('utf-8')
def predict(estimator, data_file, voca):
    with open(data_file) as f:
        contents = f.read()

    mecab = Mecab()
    morps = mecab.pos(contents)
    morps = [morp[0] for morp in morps if morp[1] in TAGS]

    input_fn = build_input_fn([' '.join(morps)], labels=None, voca=voca,
                              batch_size=1, num_epochs=1, shuffle=False)
    predict = estimator.predict(input_fn)
    cate_names = {v: k for k, v in CATES.items()}

    def second_cls(probs):
        tup = [(i, prob) for i, prob in enumerate(probs)]
        tup = sorted(tup, key=lambda x: x[1], reverse=True)
        return tup[1]

    for i, p in enumerate(predict):
        cls, probs = p['class'], p['prob']
        name, prob = cate_names[cls], probs[cls]
        cls2, prob2 = second_cls(probs)
        name2 = cate_names[cls2]
        tf.logging.info("Prediction %s: %s(%.4f), %s(%.4f)"
                        % (i + 1, name, prob, name2, prob2))
def pos():
    x = request.json  # incoming JSON payload
    print(x)
    requestText = x['text']  # text to analyze

    # ------------------------ morphological analysis ---------------------------
    m = Mecab()
    checked_sent = requestText

    # spelling pre-processing
    non_blank_checked_sent = checked_sent.replace(" ", "")  # remove whitespace
    temp_sent = pnu_spell_check(non_blank_checked_sent)
    # Running the spell checker after stripping whitespace usually performs better,
    # but occasionally it returns no correction at all; in that case fall back to
    # the original input with its whitespace kept.
    if temp_sent != '':
        checked_sent = temp_sent
    else:
        checked_sent = pnu_spell_check(checked_sent)

    r = m.pos(checked_sent)
    print(r)
    # result = ''.join(r)
    # ------------------------ end of morphological analysis --------------------
    return jsonify(result=r)  # send the result back to the caller
def view_post(request, pk):
    the_post = get_object_or_404(Post, pk=pk)
    the_comment = Comment.objects.filter(post=the_post)
    mecab = Mecab()
    morph = mecab.pos(the_post.content)
    the_morph = ' '.join(str(e) for e in morph)

    if request.method == 'GET':
        pass
    elif request.method == 'POST':
        new_comment = Comment()
        new_comment.content = request.POST.get('content')
        new_comment.post = the_post
        new_comment.save()

    return render(request, 'view_post.html', {
        'post': the_post,
        'comments': the_comment,
        'morph': the_morph,
    })
def mL(temp, temp1):
    model = load_model('./news_lstm_usev3.model')
    with open('./tokenizer_usev3.pickle', 'rb') as handle:
        tok = pickle.load(handle)

    tag_classes = ['NNG', 'NNP']
    category = {0: '세계', 1: '코로나', 2: '사회', 3: '문화', 4: '정치', 5: 'IT과학', 6: '경제'}
    m = Mecab()
    data = crawler(temp, temp1)
    ind = len(data.index)
    json_list = {}
    while ind:
        element = {}
        result_ml = ""
        element['title'] = str(data.loc[len(data.index) - ind]['title'])
        element['date'] = str(data.loc[len(data.index) - ind]['date'])
        element['contents'] = str(data.loc[len(data.index) - ind]['contents'])
        element['link'] = str(data.loc[len(data.index) - ind]['link'])
        value = m.pos((str(data.loc[len(data.index) - ind]['title'])
                       + str(data.loc[len(data.index) - ind]['contents'])).strip())
        for i in value:
            if i[1] in tag_classes and i[0] != '*':
                result_ml += i[0] + " "
        x = [result_ml.split()]
        sequence_data = tok.texts_to_sequences(x)
        pad_sequence_data = sequence.pad_sequences(sequence_data)
        element['probability'] = {}
        for idx, i in enumerate(model.predict(pad_sequence_data)[0]):
            element['probability'][category[idx]] = round((i * 100), 2)
        ind = ind - 1
        json_list[len(data.index) - ind] = element
    return json_list
def getentity_slot(intent_idx, strbuf):
    mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')

    for intent_str, idx in intent_dic.items():
        if idx == intent_idx:
            intent = intent_str
            break
    slot_value = story_slot_entity.get(intent)
    added = 0

    # When no particle follows, Mecab may fail to tag words like '피자' as nouns.
    # As a workaround we also match space-split words below; proper nouns should
    # really be added to the Mecab user dictionary.
    M = mecab.pos(strbuf)
    # print(M)
    for pos_tag in M:
        if pos_tag[1] in ['NNG', 'NNP', 'SL', 'MAG']:  # nouns and English only
            for key in slot_value:
                if pos_tag[0] in entity_list[key]:  # look up in the menu list
                    added = 1
                    if slot_value[key] is None:
                        slot_value[key] = [pos_tag[0]]
                    else:
                        slot_value[key].append(pos_tag[0])

    for word in strbuf.split(' '):
        for key in slot_value:
            if word in entity_list[key]:  # look up in the menu list
                added = 1
                if slot_value[key] is None:
                    slot_value[key] = [word]
                else:
                    slot_value[key].append(word)

    return added, slot_value
def tokenize(talk_dic):
    SW = define_stopwords("./stopwords-ko.txt")
    mecab = Mecab()

    total = {}
    for k, v in talk_dic.items():
        total_sub = []
        for idx, talk_set in enumerate(v):
            time, talk = talk_set
            clean_talk = message_cleaning(talk)
            tokenized_talk = []
            for word, tag in mecab.pos(clean_talk):
                if len(word) == 1 and tag in ['EC', 'JX', 'ETM', 'JKS', 'JKB', 'XSV', 'JKO',
                                              'XSV+EC', 'XSN', 'NNB', 'EP', 'JKG', 'VCP',
                                              'NNB+JKS', 'JKG']:
                    continue
                if word in SW and tag in ['EC', 'JX', 'ETM', 'JKS', 'JKB', 'XSV', 'JKO',
                                          'XSV+EC', 'XSN', 'NNB', 'EP', 'JKG', 'VCP',
                                          'NNB+JKS', 'JKG']:
                    continue
                tokenized_talk.append((word, tag))
            # talk_dic[k][idx] = (time, talk, tokenized_talk)
            total_sub.extend(tokenized_talk)
        total[k] = total_sub
    return total
class KorPreprocessor(PreprocessorBase):
    def __init__(self):
        super(KorPreprocessor, self).__init__()
        self.tagger = Mecab()

    def _to_morphs(self, s):
        return self.tagger.pos(s)

    @staticmethod
    def _clean(s):
        s = re.sub(r"[^가-힣ㄱ-ㅎ?.!,]", " ", s)
        s = s.strip()
        return s

    def preprocess(self, s):
        s = self._basic_nmt(s)
        s = self._clean(s)
        tagged = self._to_morphs(s)
        _s = []
        for w, _ in tagged:
            _s.append(w)
        s = " ".join(_s)
        s = self._add_token(s)
        return s
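# Hedged usage sketch for KorPreprocessor (PreprocessorBase, _basic_nmt and _add_token
# come from the surrounding project and are not shown here; the output is illustrative):
#
#   kp = KorPreprocessor()
#   kp.preprocess('오늘 날씨가 정말 좋네요!')
#   # -> the cleaned sentence split into space-separated morphemes, wrapped with
#   #    whatever start/end tokens _add_token() appends.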
class SearchCluster():
    def __init__(self, app):
        self.app = app
        self.mecab = Mecab()
        self.load_models()

    def load_models(self):
        self.word2vec = gensim.models.Word2Vec.load_word2vec_format(WORD2VEC_MODEL, binary=True)
        self.cluster_pipe = joblib.load(PIPE_DUMPING)

    def __task_to_vector(self, task):
        words = [key for key, pos in self.mecab.pos(task)]
        # aggregate the word vectors
        vector = np.mean(np.array([self.word2vec[word] for word in words
                                   if word in self.word2vec]), axis=0)
        return vector

    def __predict_label(self, task):
        vector = self.__task_to_vector(task)
        return self.cluster_pipe.predict(vector)[0]

    def get_articles(self, user_id, task, topn=3):
        label = self.__predict_label(task)
        article_id_list = list(self.app.query_pool2.get_same_cluster_articles(user_id, label, topn))
        return list(self.app.query_pool2.get_article_list_by_id(article_id_list))
def preprocessing(self):  # 2018038092 안준
    print("\ndata preprocessing...\n")
    mecab = Mecab()

    stopwords = []  # list that will hold the stopwords
    reader = csv.reader(self.stopwords_csv)
    for row in reader:
        stopwords.append(row)
    stopwords = sum(stopwords, [])  # flatten the 2-D list into a 1-D list
    self.stopwords_csv.close()

    for i in self.data.index:
        word_token = mecab.pos(self.data.at[i, 'content'])  # POS-tag the plot text
        # keep common nouns, proper nouns, verbs and adjectives
        filtering = [x for x, y in word_token if y in ['NNG', 'NNP', 'VV', 'VA']]

        # strip whitespace
        new_filtering = [w.replace(' ', '') for w in filtering]

        # remove stopwords (loaded from stopwords.csv)
        result = []  # preprocessed plot text
        for word in new_filtering:
            if word not in stopwords:
                result.append(word)

        # store the result back in the dataframe
        self.data.at[i, 'content'] = result  # assign the preprocessed result to row i, column 'content'
        # a list cannot be tokenized later, so join it back into a single string
        self.data.at[i, 'content'] = ' '.join(self.data.at[i, 'content'])

    print('\ncomplete data preprocessing.')
def doc_to_stemmed_words(self):
    '''
    Return the stems of the words extracted from each sentence of a news article.

    :param text: news article text (string)
    :return: nested list with one list of word stems per sentence
    '''
    sentences = (self.text).split(".")

    # kkma = Kkma()
    # remove_pos = "[(?P<조사>JK.*)(?P<접속조사>JC.*)(?P<전성어미>ET.*)(?P<종결어미>EF.*)(?P<연결어미>EC.*)(?P<접미사>XS.*)(?P<마침표물음표느낌표>SF.*)(?P<쉼표가운뎃점콜론빗금>SP.*)]"  # kkma
    mecab = Mecab()
    remove_pos = "[(?P<조사>JK.*)(?P<접속조사>JC.*)(?P<전성어미>ET.*)(?P<종결어미>EF.*)(?P<연결어미>EC.*)(?P<접미사>XS.*)(?P<마침표물음표느낌표>SF.*)(?P<쉼표가운뎃점콜론빗금>SC.*)]"  # mecab

    stemmed_sentences = []
    for sentence in sentences:
        # stemmed_words = kkma.pos(sentence)
        stemmed_words = mecab.pos(sentence)
        stemmed_words = [x[0] for x in stemmed_words
                         if not bool(re.match(remove_pos, x[1]))]
        stemmed_sentences.append(stemmed_words)
    return stemmed_sentences
def convert_data(self):
    """
    augment data with entity list and pattern
    :return: None
    """
    with codecs.open(self.pattern_data_path, "r", "utf-8") as fileObj:
        document = fileObj.readlines()

    return_arr = []
    for i, line in enumerate(document):
        words = []
        if self.use_mecab:
            mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
            pos = mecab.pos(line)
            for word, tag in pos:
                words.append(word)
        else:
            words = str(line).split(' ')

        print("===={0} line job start".format(i))
        match_keys = self._check_all_match(words)

        if self.out_format_type == 'plain':
            aug_data = self._aug_sent(match_keys, words, [])
            self._plain_formatter(aug_data)
        elif self.out_format_type == 'iob':
            aug_data = self._aug_sent(match_keys, words, [])
            self._iob_formatter(aug_data)
        else:
            raise Exception(' '.join(['not', 'plain', 'or iob']))
        print("===={0} line job done".format(i))
def tag_pos(sentences, tagger='kkma'):
    """
    Predict the part-of-speech tags of the input sentences.

    PoS tagger: KKMA or Mecab

    :param sentences: list of input sentences
    :return: tagged sentences
    """
    if tagger == 'kkma':
        kkma = Kkma()
    elif tagger == 'mecab':
        mecab = Mecab()

    morph_lists = []
    for sent in sentences:
        morph_list = []
        if tagger == 'kkma':
            pos_tagged_sentences = kkma.pos(sent)
        elif tagger == 'mecab':
            pos_tagged_sentences = mecab.pos(sent)
        for (key, value) in pos_tagged_sentences:
            value = transform_pos(value, tagger)
            morph_list.append([key, value])
        morph_lists.append(morph_list)
    return morph_lists
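# Hedged usage sketch for tag_pos (transform_pos() is defined elsewhere in the same
# module; the exact tags depend on the chosen backend and its dictionary):
#
#   tag_pos(['피자와 스파게티가 좋다'], tagger='mecab')
#   # -> [[['피자', <mapped tag>], ['와', <mapped tag>], ...]]
#   #    one [morpheme, transformed tag] list per input sentence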
class Preprocess:
    def __init__(self, word2idx_dic="", userdic=None) -> None:
        if word2idx_dic != "":
            f = open(word2idx_dic, "rb")
            self.word_index = pickle.load(f)
            f.close()
        else:
            self.word_index = None

        if userdic is None:
            self.mecab = Mecab()
        else:
            self.mecab = Mecab(dicpath=userdic)

        self.exclusion_tags = [
            "JKS", "JKC", "JKG", "JKO", "JKB", "JKV", "JKQ", "JX", "JC",
            "SF", "SP", "SS", "SE", "SO",
            "EP", "EF", "EC", "ETN", "ETM",
            "XSN", "XSV", "XSA",
        ]

    def pos(self, sent):
        return self.mecab.pos(sent)

    def get_keywords(self, pos, without_tag=False):
        f = lambda x: x in self.exclusion_tags
        word_list = []
        for p in pos:
            if not f(p[1]):
                word_list.append(p if not without_tag else p[0])
        return word_list

    def get_wordidx_sequence(self, keywords):
        if self.word_index is None:
            return []
        w2i = []
        for word in keywords:
            try:
                w2i.append(self.word_index[word])
            except KeyError:
                w2i.append(self.word_index["OOV"])
        return w2i
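# Hedged usage sketch for the Preprocess class (assumes konlpy with a mecab-ko dictionary;
# without a word2idx pickle, get_wordidx_sequence() simply returns an empty list):
#
#   p = Preprocess(word2idx_dic="", userdic=None)
#   tagged = p.pos("내일 오전 10시에 예약해 주세요")
#   keywords = p.get_keywords(tagged, without_tag=True)   # particles and endings filtered out
#   seq = p.get_wordidx_sequence(keywords)                # [] here, since no word index was loaded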
def nlp_function(self, str):
    mecab = Mecab(dicpath="C:\\mecab\\mecab-ko-dic")
    pos_data = mecab.pos(str)
    res_str = ""
    for i in range(len(pos_data)):
        if pos_data[i][1][:2] == 'NN' or pos_data[i][1][:2] == 'VV':
            res_str += pos_data[i][0] + " "
    return res_str
def filter_by_pos(text, accepts=["NNG", "NNP", "NNB", "VA", "VV", "VX", "VCP", "SL", "SH"]):
    mecab = Mecab()
    temp = []
    for pos in mecab.pos(text):
        if pos[1] in accepts:
            temp.append(pos[0])
    del mecab
    return " ".join(temp)
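# Hedged usage sketch for filter_by_pos (assumes konlpy with a mecab-ko dictionary;
# the output shown is approximate):
#
#   filter_by_pos("피자와 스파게티가 맛있다")
#   # -> roughly "피자 스파게티 맛있": only tokens tagged with one of the `accepts`
#   #    tags survive, joined by single spaces.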
def files_to_map(self, folder, visited_file):
    """
    Read the files in a folder and build a word-frequency dictionary.
    The dictionary maps each word to the number of its occurrences across files.

    Args:
        folder: directory that contains the files
        visited_file: CSV file listing files that have already been processed
    """
    m = Mecab()
    answer = {}
    # skip files that have already been processed
    visited = self.get_visited_file(visited_file)
    check = False
    try:
        with open('word_idf.json', 'r') as f:
            html = f.read()
        # print(visited)
        for item in self.get_file_list(folder):
            # print(item)
            if not os.path.isfile(item):
                continue
            if item in visited:
                continue
            encoding = ['utf-8', 'cp949']
            for encode in encoding:
                try:
                    with open(item, 'r', encoding=encode) as f:
                        html = f.read()
                    html = re.sub('[^가-힣ㄱ-ㅎ ]', ' ', html)
                    html = re.sub(r'(.)\1+', r'\1\1', html)
                    for word in m.pos(html):
                        if word[1] not in ['NNP', 'NNG', 'NNB', 'VA']:
                            continue
                        answer.setdefault(word[0], 1)
                        answer[word[0]] = answer[word[0]] + 1
                    visited.append(item)
                    check = True
                    break
                except Exception as e:
                    log.error('Files_to_map() Line = ' + str(inspect.currentframe().f_lineno) + " Error: " + str(e))
        if answer:
            try:
                # this should merge into the existing file rather than overwrite it
                json.dump(answer, open('word_idf.json', 'w'))
            except Exception as e:
                log.error("Files_to_map() Line = " + str(inspect.currentframe().f_lineno) + " Error: " + str(e))
        if check:
            try:
                with open(visited_file, 'w', encoding='utf-8') as f:
                    writer = csv.writer(f)
                    writer.writerow(visited)
            except Exception as e:
                log.error("Files_to_map Line = " + str(inspect.currentframe().f_lineno) + " Error: " + str(e))
    except Exception as e:  # outer handler (assumed); mirrors the logging pattern used above
        log.error("Files_to_map() Line = " + str(inspect.currentframe().f_lineno) + " Error: " + str(e))
    return True
def ismenu(msg):
    mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
    M = mecab.pos(msg)
    menu = []
    for pos_tag in M:
        if pos_tag[1] in ['NNG', 'NNP', 'SL', 'MAG']:  # nouns and English only
            if pos_tag[0] in menu_list:  # look up in the menu list
                menu.append(pos_tag[0])
    return menu
def pos_by_ISBN(self, contents):
    mecab = Mecab()
    pos_list = []
    for col in self.collection_review.find({"ISBN": contents}, {"_id": 0, "review_text": 1}):
        pos = mecab.pos(col['review_text'])
        pos_list.append(pos)
    return pos_list
def _pos_tagger(self, input, type='mecab'):
    """
    :param input:
    :return:
    """
    if type == 'mecab':
        mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
        return mecab.pos(str(input))
def tokenize_ko(lyrics_file_ko):
    print("\n-------- K-POP LYRICS --------")
    # lyrics_file_ko = "../crawl_data/lyrics_kr/kr_lyrics_verbose.csv"
    df = pd.read_csv(lyrics_file_ko)
    print(df.shape, "# as_is_kpop")
    df = df.dropna()
    print(df.shape, "# dropna()")
    df = df.drop_duplicates()
    print(df.shape, "# drop_duplicates()")
    data = list(df['Lyrics'].values)
    print("num_lyrics_kpop:", len(data))

    # Load Korean stopwords.
    stopwords = ["하:VV", "있:VV", "되:VV", "있:VA", "이러:VV"]

    # Load Korean morphological analyzer.
    mecab = Mecab()

    word_list = []
    for lyric in data:
        lyric = re.sub('[a-zA-z]', '', lyric)
        parsed = mecab.pos(lyric)
        tmp = []
        for w, pos in parsed:
            # We look for four parts of speech.
            # See below URL for POS tags (Mecab-ko).
            # *** KoNLPy Korean POS Tag Comparison Chart ***
            # https://docs.google.com/spreadsheets/d/1OGAjUvalBuX-oZvZ_-9tEfYD2gQe7hTGsgUpiiBSXI8/edit#gid=0
            if (pos == 'NNG') | (pos == 'NNP') | (pos == 'VV') | (pos == 'VA'):
                wpos = "{}:{}".format(w, pos)
                if wpos not in stopwords:
                    tmp.append(wpos)
        word_list.append(tmp)

    # Save tokenized lyrics, which contain nouns, verbs, and adjectives, to a pickle file.
    with open("../data/tokeninzed_kpop.p", 'wb') as f:
        pickle.dump(word_list, f)
    print("word list kpop sample:", word_list[0])

    flat_list = [item for sublist in word_list for item in sublist]
    print("total_kpop_words:", len(flat_list))
    counts = Counter(flat_list)
    print("uniq_words_kpop:", len(counts))

    # Save the unique word list with frequencies to file.
    with open("../data/uniq_words_freq_kpop.txt", 'w') as f:
        for k, v in counts.most_common():
            f.write("{}\t{}\n".format(k, v))
def tokenize_ko(lyrics_file_ko):
    print("\n-------- K-POP LYRICS --------")
    # lyrics_file_ko = "../crawl_data/lyrics_kr/kr_lyrics_verbose.csv"
    df = pd.read_csv(lyrics_file_ko)
    print(df.shape, "# as_is_ko")
    df = df.dropna()
    print(df.shape, "# dropna()")
    df = df.drop_duplicates()
    print(df.shape, "# drop_duplicates()")
    data = list(df['Lyrics'].values)
    print("ko num of lyrics:", len(data))

    # Load Korean stopwords.
    stopwords = ["하:VV", "있:VV", "되:VV", "있:VA", "이러:VV"]

    # Load Korean morphological analyzer.
    mecab = Mecab()

    morphs = []
    for lyric in data:
        lyric = re.sub('[a-zA-z]', '', lyric)
        parsed = mecab.pos(lyric)
        tmp = []
        for w, pos in parsed:
            # We look for four parts of speech.
            # See below URL for POS tags (Mecab-ko).
            # *** KoNLPy Korean POS Tag Comparison Chart ***
            # https://docs.google.com/spreadsheets/d/1OGAjUvalBuX-oZvZ_-9tEfYD2gQe7hTGsgUpiiBSXI8/edit#gid=0
            if (pos == 'NNG') | (pos == 'NNP') | (pos == 'VV') | (pos == 'VA'):
                wpos = "{}:{}".format(w, pos)
                if wpos not in stopwords:
                    tmp.append(wpos)
        morphs.append(tmp)

    # Create the 'processed' directory if there isn't one.
    processed_dir = "processed"
    if not os.path.exists(processed_dir):
        os.makedirs(processed_dir)

    with open("processed/word_list_ko.p", 'wb') as f:
        pickle.dump(morphs, f)

    flat_list = [item for sublist in morphs for item in sublist]
    print("total_ko_words:", len(flat_list))
    counts = Counter(flat_list)
    print("uniq_words_ko:", len(counts))

    with open("processed/uniq_word_ko.txt", 'w') as f:
        for k, v in counts.most_common():
            f.write("{}\t{}\n".format(k, v))
def hello():
    app = Flask(__name__)
    app.config['JSONIFY_PRETTYPRINT_REGULAR'] = True
    contents = "ndllocvcv"
    from konlpy.tag import Mecab
    tagger = Mecab()
    t = tagger.pos("고양이는 양옹뉴턴야옹")
    print("========================================")
    return json.dumps(t, ensure_ascii=False)
def tokenize_n(doc):
    pos_tagger = Mecab()
    a = []
    for t in pos_tagger.pos(doc):
        if re.search(nnpattern, t[1]) is not None:
            a.append('/'.join(t))
        else:
            continue
    return a
def _pos_tagger(self, input, type='mecab'):
    if type == 'mecab':
        osx_path = '/usr/local/lib/mecab/dic/mecab-ko-dic'
        tumbleweed_path = '/usr/local/lib64/mecab/dic/mecab-ko-dic'
        mecab = Mecab(osx_path)
        return mecab.pos(str(input))
    elif type == 'twitter':
        twitter = Twitter()
        return twitter.pos(str(input))
def getNVM(text: str):
    tokenizer = Mecab()
    parsed = tokenizer.pos(text)
    pos = []
    tags = ['NNG', 'NNP']
    for word in parsed:
        tag = word[1]
        if tag in tags:
            pos.append(word[0])
    return pos
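# Hedged usage sketch for getNVM (assumes konlpy with a mecab-ko dictionary; despite the
# name, only common and proper nouns, NNG/NNP, are kept):
#
#   getNVM("삼성전자가 새로운 스마트폰을 공개했다")
#   # -> roughly ['삼성전자', '스마트폰', '공개'], depending on the installed dictionary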
def main():
    mecab = Mecab()
    if len(sys.argv) < 2:
        result = {'result': 'none'}
        print json.dumps(result)
        sys.exit(0)

    morphem_list = mecab.pos(sys.argv[1].decode('utf-8'))
    result_dict = {}
    result_dict['result'] = [x[0].encode('utf-8') for x in morphem_list]
    print json.dumps(result_dict)
def _mecab_parse(self, str_arr, tag_combine=True):
    """
    :param str_arr: list of strings to parse
    :return:
    """
    mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
    return_arr = []
    for data in str_arr:
        return_arr = return_arr + self._flat(mecab.pos(str(data)), tag_combine=tag_combine)
    return return_arr
def _pos_raw_data(self, lt):
    """
    :param lt: list type value
    :return:
    """
    mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
    return_arr = []
    for raw in lt:
        pos = mecab.pos(raw)
        for word, tag in pos:
            return_arr.append("{0}/{1}".format(word, tag))
    return return_arr
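# Hedged usage sketch for _pos_raw_data (a method of the surrounding class; the dictionary
# path is the one hard-coded above and the tags shown are approximate):
#
#   self._pos_raw_data(['피자와 스파게티가'])
#   # -> roughly ['피자/NNG', '와/JC', '스파게티/NNG', '가/JKS']: each morpheme is emitted
#   #    as a "word/tag" string in one flat list.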
def _pos_tag_predict_data(self, x_input, word_len):
    """
    :param x_input:
    :return:
    """
    word_list = []
    mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
    for word_tuple in self._pad_predict_input(mecab.pos(x_input), word_len):
        if len(word_tuple[1]) > 0:
            word = ''.join([word_tuple[0], "/", word_tuple[1]])
        else:
            word = word_tuple[0]
        word_list.append(word)
    return word_list
def parse(self, data_path="data"):
    file_list = glob.glob("%s/*.json" % data_path)
    json_list = []
    shuffle(file_list)

    for json_file_name in file_list:
        json_file = json.loads(open(json_file_name).read())
        json_list += json_file["articles"]

    mecab = Mecab()
    dataframe = []
    for article in json_list:
        text = bs(article["text"], "html.parser").text
        title_pos = ["%s_%s" % (word, pos) for word, pos in mecab.pos(article["title"])]
        author_pos = ["%s_%s" % (word, pos) for word, pos in mecab.pos(article["author"])]
        text_pos = ["%s_%s" % (first, second) for first, second in mecab.pos(text)]
        dataframe.append({
            "title_pos": title_pos,
            "title_pos_sentences": " ".join(title_pos),
            "author_pos": author_pos,
            "author_pos_sentences": " ".join(author_pos),
            "text": article["text"],
            "text_pos": text_pos,
            "text_pos_sentences": " ".join(text_pos),
            "forumid": article["forumid"],
            "istroll": article["is_troll"],
            "pk": article["pk"]
        })

    dataframe = pd.DataFrame.from_dict(dataframe)
    dataframe = dataframe.set_index("pk")
    return dataframe
def learning(request, pk):
    the_post = get_object_or_404(Post, pk=pk)
    mecab = Mecab()
    morph = mecab.pos(the_post.content)

    if request.method == "GET":
        pass
    elif request.method == "POST" and not the_post.sentiword_set.exists():
        for m in range(len(morph)):
            the_word = Sentiword()
            the_word.word = str(morph[m])
            the_word.post = the_post
            the_post.senti = request.POST.get('senti')
            the_post.save()
            the_word.save()
        return redirect('view_post', pk=pk)
    else:
        return redirect('view_post', pk=pk)

    return render(request, 'learning.html', {
        'post': the_post,
    })
def preprocess(args):
    """
    Description
        Return
        - word2idx: Sequence of word indices. 2-dim, shaped [# of laws, # of words in each law].
        - word_dict: Word-to-index mapping table, { word: idx } (only contains VOCA_SIZE words).
        - word_inv_dict: Inverted version of word_dict, { idx: word } (only contains VOCA_SIZE words).
        - word_count: Word counter over all laws. Only contains VOCA_SIZE words.
    """
    tagger = Mecab()
    with open(args.input, "r") as reader:
        data = reader.read()

    # Sequence of words in each law. [num_laws, num_words]
    word_list = list()
    # Sequence of indices. [num_laws, num_words]
    word2idx = list()
    # Mapping table of word -> idx.
    word_dict = dict()
    # Inverted mapping table of word -> idx (for fast access).
    word_inv_dict = dict()
    # Word counter.
    word_count = list()

    """ Tag part-of-speech and remove unimportant words (like josa..). """
    # Split the laws by the <END> symbol.
    law_list = data.split("<END>")
    for law in law_list:
        # Eliminate special chars.
        law = re.sub("[^a-zA-Z0-9가-힣 \n]", " ", law)
        # 1. Eliminate newlines, tabs and strange chars.
        # 2. Split words by space.
        word_list.append(law.replace("\n", " ").replace("\t", " ").replace("\xa0", "").split(" "))

    for i, v in enumerate(word_list):
        for j, word in enumerate(v):
            # Tag words using the Mecab tagger and exclude some tags.
            tag = tagger.pos(word)
            excluded = [t[0] for t in tag if not re.search("NN|XR", t[1])]
            # Exclude a word if it contains a number (e.g. drop words like 제1조, 제1항).
            for t in tag:
                if t[1] == "SN":
                    word_list[i][j] = ""
            # Reconstruct word_list by removing the excluded tokens.
            for e in excluded:
                word_list[i][j] = word_list[i][j].replace(e, "")
        word_list[i] = [w for w in word_list[i] if len(w) > 1 or w == "법"]

    # If the last element of word_list is empty, remove it.
    if not word_list[-1]:
        word_list.pop()

    # Construct the word counter. The 1st element is the UNKNOWN word (simply UNK).
    word_count.append(["UNK", 0])
    merged = list(itertools.chain.from_iterable(word_list))
    word_count.extend(collections.Counter(merged).most_common(args.voca_size - 1))

    # Construct the word mapping tables.
    word_dict = {v[0]: i for v, i in zip(word_count, itertools.count(0))}
    word_inv_dict = {i: v for v, i in word_dict.items()}

    # Convert each law into a sequence of word indices.
    for v in word_list:
        row = list()
        for word in v:
            idx = word_dict.get(word)
            if idx is not None:
                row.append(idx)
            else:
                row.append(word_dict.get("UNK"))
                word_count[0][1] += 1
        word2idx.append(row)

    word_list = None   # not used anymore
    word_dict = None   # not used anymore
    word_count = None  # not used anymore

    return np.array(word2idx), word_inv_dict
class DataAugmentation:
    """
    Data augmentation class for nlp,
    mainly for creating iob data with a pattern file and dict
        test = DataAugmentation()
        test.load_dict()
        test.convert_data()
    """

    class ThreadCls(threading.Thread):
        def __init__(self, obj, idx):
            threading.Thread.__init__(self)
            self.obj = obj
            self.idx = idx

        def run(self):
            for _ in range(self.obj.dict_sample_iter):
                self.obj.load_dict()
                self.obj.convert_data(self.idx)

        def join(self):
            threading.Thread.join(self)
            return True

    def __init__(self, conf):
        """
        init params (these params should eventually be managed in a db)
        """
        self.aug_file_cnt = 0
        self.use_mecab = conf.get("use_mecab")
        self.max_file_size = conf.get("max_file_size")  # 10M
        self.pattern_data_path = conf.get("pattern_data_path")
        self.augmented_out_path = conf.get("augmented_out_path")
        self.dict_path = conf.get("dict_path")
        self.out_format_type = conf.get("out_format_type")
        self.ner_dicts = {}
        self.gpu_use = True
        self.dict_sample_size = int(conf.get("dict_sample_size"))
        self.dict_sample_iter = int(conf.get("dict_sample_iter"))
        self.thread_num = int(conf.get("thread_num"))

    def run(self):
        """
        run
        :return:
        """
        job_list = []
        for idx, _ in enumerate(range(self.thread_num)):
            job_list.append(self.ThreadCls(self, idx))
        for job in job_list:
            job.start()
        for job in job_list:
            job.join()

    def load_dict(self):
        """
        load dict list from csv file
        :return:
        """
        self.ner_dicts = {}
        df_csv_read = pd.read_csv(self.dict_path,
                                  skipinitialspace=True,
                                  engine="python",
                                  encoding='utf-8-sig')
        df_csv_read = df_csv_read.sample(n=self.dict_sample_size)
        for col in df_csv_read.keys():
            self.ner_dicts[col] = []
            for val in list(set(df_csv_read[col])):
                if val == val and val is not None:  # skip NaN values
                    self.ner_dicts[col].append(val)

    def _check_all_match(self, words):
        """
        check which words match an entity dict key
        :param words: sentence as a list of words
        :return: list of matching keys
        """
        match_keys = []
        for word in words:
            word = word.replace('\n', '')
            if word in list(self.ner_dicts.keys()):
                match_keys.append(word)
        return match_keys

    # @autojit
    def _aug_sent(self, keys, pattern, return_aug_sent=[]):
        """
        function which actually augments sentences with the given pattern and keys
        :param keys: entity keys
        :param pattern: sentence pattern
        :return: list of augmented sentences
        """
        try:
            if len(keys) > 0:
                key = keys[0]
                del keys[0]
            else:
                return return_aug_sent

            if len(return_aug_sent) == 0:
                for word in self.ner_dicts[key]:
                    line = []
                    for slot in pattern:
                        for rep in ['\n', 'NaN']:
                            slot = slot.replace(rep, '')
                        if key in slot:
                            for wd in self.mecab.morphs(word):
                                wd = wd.replace(' ', '')
                                line.append((wd, key))
                        else:
                            line.append((slot, 'O'))
                    return_aug_sent.append(line)
            else:
                del_idx = []
                for i, line in enumerate(return_aug_sent):
                    for j, slot in enumerate(line):
                        if slot[0] == key:
                            for word in self.ner_dicts[key]:
                                line = return_aug_sent[i].copy()
                                for z, slot in enumerate(line):
                                    if slot[0] == key:
                                        buffer = ""
                                        for wd in self.mecab.morphs(word):
                                            wd = wd.replace(' ', '')
                                            if len(buffer) > 0:
                                                buffer = ''.join([buffer, ' ', wd])
                                            else:
                                                buffer = wd
                                        if len(buffer) > 1:
                                            line[z] = (buffer, key)
                                return_aug_sent.append(line)
                            del_idx.append(i)
                for _ in del_idx:
                    del return_aug_sent[0]
            return self._aug_sent(keys, pattern, return_aug_sent)
        except Exception as e:
            print("error on nlp data augmentation :{0}".format(e))

    def _iob_formatter(self, aug_data, idx):
        """
        save aug list in iob file format
        :param aug_data: augmented list of sentences
        :return: None
        """
        if aug_data is None:
            pass
        path = ''.join([self.augmented_out_path, '/' + str(idx), 'Test', str(self.aug_file_cnt), '.iob'])
        if os.path.exists(path) == False or os.path.getsize(path) < self.max_file_size:
            with open(path, "a") as f:
                for line in aug_data:
                    for word in line:
                        related_words = word[0].split(' ')
                        for token in related_words:
                            f.write(''.join([token, ' ', word[1]]))
                            f.write('\n')
                    f.write('\n')
        else:
            self.aug_file_cnt = self.aug_file_cnt + 1
            path = ''.join([self.augmented_out_path, '/' + str(idx), 'Test', str(self.aug_file_cnt), '.iob'])
            with open(path, "w") as f:
                for line in aug_data:
                    for word in line:
                        related_words = word[0].split(' ')
                        for token in related_words:
                            f.write(''.join([token, ' ', word[1]]))
                            f.write('\n')
                    f.write('\n')

    def _plain_formatter(self, aug_data, idx):
        """
        save aug list in plain text format
        :param aug_data: augmented list of sentences
        :return: None
        """
        if aug_data is None:
            pass
        path = ''.join([self.augmented_out_path, '/' + str(idx), 'Test', str(self.aug_file_cnt), '.out'])
        if os.path.exists(path) == False or os.path.getsize(path) < self.max_file_size:
            with open(path, "a") as f:
                for line in aug_data:
                    for word in line:
                        f.write(''.join([word[0], ' ']))
                    f.write('\n')
        else:
            self.aug_file_cnt = self.aug_file_cnt + 1
            path = ''.join([self.augmented_out_path, '/' + str(idx), 'Test', str(self.aug_file_cnt), '.out'])
            with open(path, "w") as f:
                for line in aug_data:
                    for word in line:
                        f.write(''.join([word[0], ' ']))
                    f.write('\n')

    def _intent_formatter(self, aug_data, key, idx):
        """
        save aug list in csv (encode,decode) format
        :param aug_data: augmented list of sentences
        :return: None
        """
        if aug_data is None:
            pass
        path = ''.join([self.augmented_out_path, '/' + str(idx), 'Test', str(self.aug_file_cnt), '.csv'])
        if os.path.exists(path) == False:
            with open(path, "w") as f:
                f.write('encode,decode\n')
        if os.path.exists(path) == False or os.path.getsize(path) < self.max_file_size:
            with open(path, "a") as f:
                for line in aug_data:
                    for word in line:
                        f.write(''.join([word[0], ' ']))
                    f.write(',')
                    f.write(str(key))
                    f.write('\n')
        else:
            self.aug_file_cnt = self.aug_file_cnt + 1
            path = ''.join([self.augmented_out_path, '/' + str(idx), 'Test', str(self.aug_file_cnt), '.csv'])
            with open(path, "a") as f:
                for line in aug_data:
                    for word in line:
                        f.write(''.join([word[0], ' ']))
                    f.write(',')
                    f.write(str(key))
                    f.write('\n')

    def convert_data(self, idx):
        """
        augment data with the entity list and pattern
        :return: None
        """
        try:
            if self.out_format_type == 'intent':
                self._conv_type_b(idx)
            else:
                self._conv_type_a(idx)
        except Exception as e:
            print("error log : {0}".format(e))

    def _conv_type_b(self, idx):
        """
        :return:
        """
        df_csv_read = pd.read_csv(self.pattern_data_path,
                                  skipinitialspace=True,
                                  engine="python",
                                  encoding='utf-8-sig')
        i = 0
        for key, line in zip(df_csv_read['decode'].values, df_csv_read['encode'].values):
            words = []
            if self.use_mecab:
                self.mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
                pos = self.mecab.pos(line)
                for word, tag in pos:
                    words.append(word)
            else:
                words = str(line).split(' ')
            match_keys = self._check_all_match(words)
            aug_data = self._aug_sent(match_keys, words, [])
            self._intent_formatter(aug_data, key, idx)
            if i % 100 == 0:
                print("====Thread{0} : {1} line job done".format(idx, i))
            i = i + 1

    def _conv_type_a(self, idx):
        """
        :return:
        """
        df_csv_read = pd.read_csv(self.pattern_data_path,
                                  skipinitialspace=True,
                                  engine="python",
                                  encoding='utf-8-sig')
        i = 0
        for line in df_csv_read['encode'].values:
            words = []
            if self.use_mecab:
                self.mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
                pos = self.mecab.pos(line)
                for word, tag in pos:
                    words.append(word)
            else:
                words = str(line).split(' ')
            match_keys = self._check_all_match(words)
            if self.out_format_type == 'plain':
                aug_data = self._aug_sent(match_keys, words, [])
                self._plain_formatter(aug_data, idx)
            elif self.out_format_type == 'iob':
                aug_data = self._aug_sent(match_keys, words, [])
                self._iob_formatter(aug_data, idx)
            else:
                raise Exception(' '.join(['not', 'plain', 'or iob']))
            if i % 100 == 0:
                print("====Thread{0} : {1} line job done".format(idx, i))
            i = i + 1


# da = DataAugmentation({
#     "use_mecab": True,
#     "max_file_size": 100000000,
#     "pattern_data_path": "/hoya_model_root/aug/pattern.csv",
#     "augmented_out_path": "/hoya_model_root/aug/aug_0810/",
#     "dict_path": "/hoya_model_root/aug/dict.csv",
#     "out_format_type": "iob",
#     "dict_sample_size": 3,
#     "dict_sample_iter": 500,
#     "thread_num": 8
# })
# da.run()
class crawl_community():
    def __init__(self):
        self.driver = webdriver.Firefox()
        self.classifier = cf.classifier()
        self.URLs = []
        self.contexts = []
        self.bag = utils.load_dictionary()
        self.tagger = Mecab()

    def __del__(self):
        self.driver.quit()

    def _crawl_URL(self):
        titles = []

        # dynamic scrolling
        more_count = 0
        while True:
            time.sleep(0.5)
            more = self.driver.find_element_by_id("real_more_page")
            if more.is_displayed():
                if more.text == "더보기":
                    more.click()
                    more_count += 1
                else:
                    break
            else:
                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            if more_count >= self.scroll:
                break

        # get html source
        html = self.driver.page_source
        soup = BeautifulSoup(html)

        # crawl URL
        for c in soup.find_all("li"):
            # if items are from community
            if c.get("class") == ['realtimeitem', 'community']:
                href = c.find("a")["href"]
                self.URLs.append(href)
                title = c.find("a").get_text().strip()
                titles.append(title)
            # if items are from twitter
            elif c.get("class") == ['realtimeitem', 'twitter']:
                for s in c.find_all("span"):
                    if s.get("class") == ['text', 'snsbody']:
                        href = s['href']
                        self.URLs.append(href)
                        titles.append("twitter")
        return titles

    def _exclude_short(self, text):
        pos = self.tagger.pos(text)
        words = [p[0] for p in pos]
        is_in = False
        for b in self.bag[0]:
            if b[0] in words:
                is_in = True
        for b in self.bag[1]:
            if b[0] in words:
                is_in = True
        return not is_in

    def _crawl_dcinside(self, url, title):
        ret = requests.get(url)
        soup = BeautifulSoup(ret.text)
        for c in soup.find_all("div"):
            if c.get("class") == ["s_write"]:
                text = c.find_all("td")[0].get_text()
                text = text.strip().replace("\n", " ")
                exclude = self._exclude_short(text)
                if not exclude:
                    self.contexts.append(["dcinside", title, text])

    """
    def _crawl_mlbpark(self, url, title):
        ret = requests.get(url)
        soup = BeautifulSoup(ret.text)
        for c in soup.find_all("td"):
            if c.get("class") == ["G13"] and c.find_all("div"):
                div = c.find_all("div")[0]
                text = div.get_text()
                text = text.strip().replace("\n", " ")
                exclude = self._exclude_short(text)
                if not exclude:
                    self.contexts.append(["mlbpark", title, text])
                break
    """

    def _crawl_twitter(self, url, title):
        ret = requests.get(url)
        soup = BeautifulSoup(ret.text)
        for c in soup.find_all("p"):
            tag = c.get("class")
            if tag and "tweet-text" in tag:
                text = c.get_text().strip().replace("\n", " ")
                exclude = self._exclude_short(text)
                if not exclude:
                    self.contexts.append(["twitter", title, text])

    def _crawl_todayhumor(self, url, title):
        ret = requests.get(url)
        soup = BeautifulSoup(ret.text)
        for c in soup.find_all("div"):
            if c.get("class") == ["viewContent"]:
                text = c.get_text().strip().replace("\n", " ")
                exclude = self._exclude_short(text)
                if not exclude:
                    self.contexts.append(["todayhumor", title, text])

    """
    def _crawl_clien(self, url, title):
        ret = requests.get(url)
        soup = BeautifulSoup(ret.text)
        c = soup.find(id="writeContents")
        if c:
            text = c.get_text().strip().replace("\n", " ")
            if self._exclude_short:
                self.contexts.append(["clien", title, text])

    def _crawl_bobaedream(self, url, title):
        ret = requests.get(url)
        soup = BeautifulSoup(ret.text)
        for c in soup.find_all("div"):
            if c.get("class") == ["bodyCont"]:
                text = c.get_text().strip().replace("\n", " ")
                if self._exclude_short:
                    self.contexts.append(["bobaedream", title, text])
    """

    def _crawl_fomos(self, url, title):
        ret = requests.get(url)
        soup = BeautifulSoup(ret.text)
        for c in soup.find_all("div"):
            if c.get("class") == ["view_text"]:
                text = c.get_text().strip().replace("\n", " ")
                exclude = self._exclude_short(text)
                if not exclude:
                    self.contexts.append(["fomos", title, text])
                break

    def _crawl_inven(self, url, title):
        ret = requests.get(url)
        soup = BeautifulSoup(ret.text)
        for c in soup.find_all("div"):
            if c.get("class") == ["powerbbsContent"]:
                text = c.get_text().strip().replace("\n", " ")
                exclude = self._exclude_short(text)
                if not exclude:
                    self.contexts.append(["inven", title, text])

    def _crawl_instiz(self, url, title):
        ret = requests.get(url)
        soup = BeautifulSoup(ret.text)
        c = soup.find(id="memo_content_1")
        if c:
            text = c.get_text().strip().replace("\n", " ")
            exclude = self._exclude_short(text)
            if not exclude:
                self.contexts.append(["instiz", title, text])

    def _crawl_ppomppu(self, url, title):
        ret = requests.get(url)
        soup = BeautifulSoup(ret.text)
        for c in soup.find_all("td"):
            if c.get("class") == ["han"]:
                text = c.get_text().strip().replace("\n", " ")
                exclude = self._exclude_short(text)
                if not exclude:
                    self.contexts.append(["ppomppu", title, text])

    # determine which site each URL comes from
    def _crawl_context(self, titles):
        for i, url in enumerate(self.URLs):
            if "dcinside" in url:
                self._crawl_dcinside(url, titles[i])
            # elif "mlbpark" in url: self._crawl_mlbpark(url, titles[i])
            elif "todayhumor" in url:
                self._crawl_todayhumor(url, titles[i])
            # elif "clien" in url: self._crawl_clien(url, titles[i])
            elif "twitter" in url:
                self._crawl_twitter(url, titles[i])
            # elif "bobaedream" in url: self._crawl_bobaedream(url, titles[i])
            elif "fomos" in url:
                self._crawl_fomos(url, titles[i])
            elif "inven" in url:
                self._crawl_inven(url, titles[i])
            elif "instiz" in url:
                self._crawl_instiz(url, titles[i])
            elif "ppomppu" in url:
                self._crawl_ppomppu(url, titles[i])
            else:
                print(url)

        # classify sentiment
        for i, v in enumerate(self.contexts):
            vector = self.classifier.features(v[1] + v[2])
            predict = self.classifier.predict(vector).tolist()[0]
            self.contexts[i].insert(0, predict)

    def crawl(self, query, scroll=5):
        self.scroll = scroll
        self.query = query
        self.url = "http://search.zum.com/search.zum?method=realtime&option=accu&query=" + query + "&cm=more"
        self.driver.get(self.url)

        titles = self._crawl_URL()
        self._crawl_context(titles)
        return self.contexts
class classifier():
    # include POS, MAG, VX to handle negation
    POS = "NN|XR|VA|VV|MAG|VX"
    POS_IDX = ["NN", "VA", "VV", "XR"]
    # "못" is handled separately
    NEG_PREV = [("아니하", "VX"), ("않", "VX"), ("없", "VA"), ("없이", "MAG")]
    NEG_NEXT = [("안", "MAG")]

    def __init__(self):
        # initialize Mecab tagger
        self.tagger = Mecab()
        # initialize regular expression
        self.exp = re.compile(self.POS, re.IGNORECASE)
        # load sentiment dictionary
        self.bag = utils.load_dictionary()
        # load model if it exists
        with open("../Resources/models/model", "rb") as model_file:
            self.model = pickle.load(model_file)

    def handle_negation(self, words, counter):
        # collect indices of words to negate, except for "못"
        neg_idx = []
        for neg in self.NEG_PREV:
            find = utils.find_dup_idx(words, neg)
            for item in find:
                if item - 1 > -1:
                    neg_idx.append(item - 1)
        for neg in self.NEG_NEXT:
            find = utils.find_dup_idx(words, neg)
            for item in find:
                if item + 1 < len(words):
                    neg_idx.append(item + 1)

        # handle "못~"
        for w in words:
            loc = w[0].find("못")
            if loc > 0 and w[1].find("VX"):
                neg_idx.append(loc - 1)

        # handle "못"
        for w in words:
            loc = w[0].find("못")
            if loc > -1 and w[1].find("MAG"):
                # long-form negation (못햇다, 못 했다..)
                if loc > 1 and words[loc - 1][1].find("VV"):
                    neg_idx.append(loc - 1)
                # short-form negation
                elif loc < len(words) - 1:
                    neg_idx.append(loc + 1)
                # limitation: cases like 못 생겼다 come out oddly

        # negate words
        for i in neg_idx:
            if words[i] in self.bag[0]:
                try:
                    idx = self.POS_IDX.index(words[i][1])
                except ValueError:
                    pass
                else:
                    counter[idx] -= 1
                    counter[idx + 4] += 1
            elif words[i] in self.bag[1]:
                try:
                    idx = self.POS_IDX.index(words[i][1])
                except ValueError:
                    pass
                else:
                    counter[idx] += 1
                    counter[idx + 4] -= 1
        return counter

    def make_features(self, sentence, words):
        # feature vector:
        # [ pos_noun, pos_adj, pos_verb, pos_root,
        #   neg_noun, neg_adj, neg_verb, neg_root ]
        counter = [0, 0, 0, 0, 0, 0, 0, 0]
        if not words:
            return counter

        for i, w in enumerate(words):
            # map the POS tag to the sentiment dictionary's tag set
            words[i] = list(words[i])
            if words[i][1].find("NN") >= 0:
                words[i][1] = "NN"
            elif words[i][1].find("VA") >= 0:
                words[i][1] = "VA"
            elif words[i][1].find("VV") >= 0:
                words[i][1] = "VV"
            elif words[i][1].find("XR") >= 0:
                words[i][1] = "XR"
            elif words[i][1].find("VX") >= 0:
                words[i][1] = "VX"
            elif words[i][1].find("MAG") >= 0:
                words[i][1] = "MAG"
            words[i] = tuple(words[i])

            # count the frequency of sentiment words
            if words[i] in self.bag[0]:    # positive
                try:
                    idx = self.POS_IDX.index(words[i][1])
                    counter[idx] += 1
                except ValueError:
                    pass
            elif words[i] in self.bag[1]:  # negative
                try:
                    idx = self.POS_IDX.index(words[i][1])
                    counter[idx + 4] += 1
                except ValueError:
                    pass

        counter = self.handle_negation(words, counter)
        return counter

    def features(self, article):
        # tag the article
        pos = self.tagger.pos(article)
        words = [p for p in pos if self.exp.search(p[1])]

        # construct the feature vector
        data = self.make_features(article, words)

        # normalize features
        arr = np.array(data, dtype=float)
        scaled = preprocessing.scale(arr).tolist()
        data = scaled
        return data

    def predict(self, vector):
        return self.model.predict(vector)
class keyword_anaylze():
    def __init__(self, date, news_limit=5, net_limit=50):
        self.section = util.load_file("section.txt")
        self.date = date
        self.news_limit = news_limit
        self.net_limit = net_limit
        self.refer = 0
        self.mecab = Mecab()
        self.exp = re.compile("NN|XR|VA|VV|MAG|VX")
        self.temp_net = {}
        self.temp_list = {}
        self.word_net = []       # related words and their frequencies
        self.word_list = []      # all words and their frequencies (used for PMI)
        self.news = []           # top # of news
        self.sentiment = [0, 0]  # [neg, pos]
        self.counter = [0 for i in range(16)]

    def _add_news(self, context, url, title):
        if len(self.news) < self.news_limit:
            self.news.append([len(context), url, title])
            self.news.sort()
        else:
            self.news[0] = [len(context), url, title]
            self.news.sort()

    def _add_word(self, words, word_list, senti):
        for w in words:
            if len(w) < 2:
                continue
            if w in word_list:
                word_list[w][0] += 1
                word_list[w][int(senti) + 1] += 1
            else:
                word_list[w] = [1, 0, 0]
                word_list[w][int(senti) + 1] += 1

    def _make_morp(self, context):
        context = re.sub(r"(\"|\')", "", context)
        words = re.findall(r"[\w']+", context)
        for i, v in enumerate(words):
            pos = self.mecab.pos(v)
            w = [p[0] for p in pos if not re.search("NN|XR|VA|VV|MAG|VX|SL|SN", p[1])]
            for x in w:
                words[i] = words[i].replace(x, "")
        # remove empty strings from words
        return [w for w in words if not w == ""]

    def _arrange_word_list(self, dictionary):
        words = sorted(dictionary.items(), key=itemgetter(1), reverse=True)
        word_list = []
        for w in words:
            pos = self.mecab.pos(w[0])
            if re.search("NN|XR", pos[0][1]):
                word_list.append(w)
        return word_list

    def _traverse_news(self, keyword):
        global news_loc
        keyword_list = keyword.split(" ")
        for s in self.section:
            idx = 0
            loc = news_loc + self.date + "/" + s
            print(loc + "/")
            while os.path.isfile(loc + "/" + str(idx)):
                f = open(loc + "/" + str(idx), "r")
                senti = f.readline().replace("\n", "")
                url = f.readline().replace("\n", "")
                title = f.readline().replace("\n", "")
                context = f.read().replace("\n", "")
                words = self._make_morp(context)
                f.close()

                self._add_word(words, self.temp_list, senti)

                is_key = True
                for key in keyword_list:
                    have_word = False
                    for w in words:
                        if key in w:
                            have_word = True
                    if not have_word:
                        is_key = False

                if is_key:
                    self.counter[0 + int(senti)] += 1
                    self.refer += 1
                    self.sentiment[int(senti)] += 1
                    self._add_news(context, url, title)
                    self._add_word(words, self.temp_net, senti)
                idx += 1

    def _traverse_community(self, keyword):
        global community_loc
        base_loc = community_loc + keyword + "/"
        idx = 0
        print(base_loc)
        while True:
            loc = base_loc + str(idx)
            idx += 1
            if not os.path.isfile(loc):
                break
            f = open(loc, "r")
            senti = f.readline().replace("\n", "")
            comm = f.readline().replace("\n", "")
            title = f.readline().replace("\n", "")
            context = f.read().replace("\n", "")
            words = self._make_morp(context)
            f.close()

            self.sentiment[int(senti)] += 1
            self._add_word(words, self.temp_list, senti)
            self._add_word(words, self.temp_net, senti)

            # determine the community
            if comm == "dcinside":
                self.counter[2 + int(senti)] += 1
            elif comm == "todayhumor":
                self.counter[4 + int(senti)] += 1
            elif comm == "twitter":
                self.counter[6 + int(senti)] += 1
            elif comm == "fomos":
                self.counter[8 + int(senti)] += 1
            elif comm == "inven":
                self.counter[10 + int(senti)] += 1
            elif comm == "instiz":
                self.counter[12 + int(senti)] += 1
            elif comm == "ppomppu":
                self.counter[14 + int(senti)] += 1

    def _make_word_net(self):
        network = []
        words = []
        count = []
        for v in self.word_net:
            words.append(v[0])
            count.append(v[1][0])
        for i, v in enumerate(self.word_list):
            for j, w in enumerate(words):
                if v[0] == w and v[1][0] > 10:
                    senti = v[1][2] / v[1][0]
                    pmi = count[j] / v[1][0]
                    network.append([w, senti, v[1][0], pmi])
        return network

    def anaylze(self, keyword):
        self._traverse_news(keyword)
        self._traverse_community(keyword)

        # sort word_net
        self.word_net = self._arrange_word_list(self.temp_net)
        if len(self.word_net) > self.net_limit:
            self.word_net = [self.word_net[i] for i in range(self.net_limit)]

        # sort word_list
        self.word_list = self._arrange_word_list(self.temp_list)

        # network = [ [word, senti, frequency, PMI] .. ]
        network = self._make_word_net()
        return self.sentiment, self.news, network, self.counter
def language_processing(input_data):
    mecab = Mecab()

    # store yes/no data per noun,
    # e.g. if wings are present, check_data['날개'] == 1
    check_data = dict()
    for name in [input_neuron.name for input_neuron in InputLayer.all_neuron]:
        # first initialize every check_data entry as "unknown" (0)
        check_data[name] = 0

    # [*range(3)] is the same as [0, 1, 2]
    word_list, pos_list = zip(*[(word, pos) for word, pos in mecab.pos(input_data)
                                if pos in ['VV', 'VA', 'NNG', 'JC', 'SC', 'MAG', 'VX']])

    # convert to a list so entries can be modified later
    # (zip() yields tuples; already-processed words are replaced with False below)
    word_list = list(word_list)
    # same reason
    pos_list = list(pos_list)

    # map predicate words to yes/no values; adjectives with a negative adverb are
    # rewritten, e.g. 날개가 안 보인다 --> 날개가 없다
    yn_dict = {
        '있': 1,
        '들리': 1,
        '보이': 1,
        '없': -1,
        '모르': 0
    }

    """
    for index in range(len(pos_list)):
        if pos_list[index] == 'MAG' and word_list[index] == '안':
            # a constituent adverb that is also a negative adverb
            word_list[index] = '없'  # replace with the negation
            for i in range(len(pos_list[index:])):
                # scan the part after the negative adverb
                if pos_list[i] in ['VV', 'VA']:
                    # when data like '있' / '없' appears, flip it using yn_change
                    try:
                        word_list[i] = yn_change[word_list[i]]
                    except KeyError:
                        word_list
                        pass
    """

    # Scan for adjectives/verbs first, then group the surrounding nouns.
    # Pick up modifiers such as 있 / 없 from the POS data.
    for index in range(len(pos_list)):
        if pos_list[index] == 'MAG' and word_list[index] == '안':
            # a constituent adverb that is also a negative adverb
            word_list[index] = '없'   # replace with the negation
            pos_list[index] = 'VA'    # update the POS data accordingly

        if pos_list[index] in ['VA', 'VV']:  # if pos carries yes/no data
            # feed the described value into the InputLayer neuron for each noun
            try:
                yn = yn_dict[word_list[index]]
            except KeyError:
                yn = 0
            finally:
                # when a negating auxiliary verb follows (e.g. ~하지 '않'는다),
                # scan forward starting from the next index
                tmp_index = index + 1
                while tmp_index < len(pos_list):
                    if pos_list[tmp_index] == 'VX':
                        if word_list[tmp_index] == '않':
                            yn *= -1
                        break
                    elif pos_list[tmp_index] == 'NNG':
                        break  # stop when the next noun appears
                    tmp_index += 1

                # store this yn value for every preceding noun
                for nng in [word_list[i] for i in range(index) if pos_list[i] == 'NNG']:
                    # skip words that were already processed
                    if nng is False:
                        continue
                    else:
                        try:
                            check_data[nng]
                        except KeyError:
                            pass
                        else:
                            check_data[nng] = yn
                # mark processed words as False
                word_list[:index] = ([False] * index)

    return check_data
learning_rate = 0.001
dim_embed = 200
n_epochs = 20
window_size = 5
min_count = 3

wiki_file = '../text/wiki_all'
with open(wiki_file) as f:
    wiki_contents = f.read()

wiki_docs = map(lambda x: filter(lambda y: y != '', x.text.split('\n')),
                BeautifulSoup(wiki_contents).find_all('doc'))
wiki_paragraphs = [item for sublist in wiki_docs for item in sublist]

paragraph_list = []
for wiki_paragraph in wiki_paragraphs:
    wiki_paragraph_pos = map(lambda x: x[0] + '^/' + x[1], mecab.pos(wiki_paragraph))
    if len(wiki_paragraph_pos) > 2:
        paragraph_list.append(wiki_paragraph_pos)
del wiki_paragraphs

word2vec_model = Word2Vec(size=dim_embed, alpha=learning_rate, min_count=min_count, workers=-1)
word2vec_model.build_vocab(paragraph_list)

for epoch in range(n_epochs):
    print "Training Epoch:", epoch
    word2vec_model.train(paragraph_list)
    word2vec_model.alpha *= 0.99

word2vec_model.save('../../models/word2vec')