def getTfidfRaw(Numdoc):
    import gensim
    from gensim import corpora
    from gensim.models import TfidfModel
    from common import prs
    import json  # json / DIR_HomeGraph are assumed to be available at module level in the original

    print("Preprocessing example")
    # (docTitle, tokenized_doc) = prs.readyData(Numdoc)  # currently fetches 5 documents
    (docId, docTitle, tokenized_doc) = prs.readyData(Numdoc)  # will be replaced by this signature

    # Build a dictionary from the tokenized documents
    dct = corpora.Dictionary(tokenized_doc)
    # Convert each document into a bag-of-words corpus
    corpus = [dct.doc2bow(line) for line in tokenized_doc]
    # Fit the TF-IDF model on the corpus
    tfmodel = TfidfModel(corpus)
    # Vector of the first document,
    # e.g. [(0, 0.004840388191324659), (1, 0.01275300075896571), ...]
    vector = tfmodel[corpus[0]]
    # print(vector)

    sortTF = []
    from operator import itemgetter
    for i, topic_list in enumerate(tfmodel[corpus]):
        topic_list = sorted(topic_list, key=itemgetter(1), reverse=True)
        # print(i, "th document, sorted TF-IDF:", topic_list)
        sortTF.append((i, topic_list))

    # numpy is only used to round the TF-IDF values
    import numpy as np
    resultTF = []
    # sortTF[n][1] gives the term/value pairs of document n
    # note: the 3rd document is currently empty
    print(sortTF)
    for i, section in sortTF:
        mainTF = []
        print(i, "th document, number of terms:", len(section))
        for wordid, value in section:
            # print(dct[wordid], "-", np.around(value, decimals=5))
            mainTF.append((dct[wordid], np.around(value, decimals=5)))
        resultTF.append((i, mainTF))

    # DIR_HomeGraph is a module-level output path
    with open(DIR_HomeGraph, 'w', -1, "utf-8") as f:
        json.dump(resultTF, f, ensure_ascii=False)
    return resultTF
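# A small usage sketch for getTfidfRaw(): print the ten highest-weighted
# terms of the first document. It assumes the module-level DIR_HomeGraph
# output path is configured; the slice size of 10 is arbitrary.
# resultTF = getTfidfRaw(5)
# doc_index, term_weights = resultTF[0]
# for word, weight in term_weights[:10]:
#     print(word, weight)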
def LDA(ndoc, nit=NUM_ITER, ntp=NUM_TOPICS):
    # Update the module-level values if new parameters were passed in.
    global NUM_ITER
    global NUM_TOPICS
    if NUM_ITER != nit:
        NUM_ITER = nit
    if NUM_TOPICS != ntp:
        NUM_TOPICS = ntp

    print("Starting LDA algorithm!")
    print(
        "##########Phase 0 : LDA option:##########",
        "\nDOWNLOAD OPTION : ", str(DOWNLOAD_DATA_OPTION),
        # "\nBACKEND CONNECTION OPTION : ", str(BACKEND_CONCT),
        # "\nRANDOM ORDER OPTION : ", str(RANDOM_MODE)
    )

    # Phase 1 : READY DATA
    print("\n\n##########Phase 1 : READY DATA##########")
    (doc_id, titles, tokenized_doc, contents) = prs.readyData(ndoc, True)

    # Phase 2 : run the LDA algorithm
    print("\n\n##########Phase 2 : LDA Algo##########")
    result = runLda(titles, tokenized_doc, contents)

    if DOWNLOAD_DATA_OPTION:
        # LDA_DIR_FE is a module-level output path
        with open(LDA_DIR_FE, 'w', -1, "utf-8") as f:
            json.dump(result, f, ensure_ascii=False)

    showTime()
    if DOWNLOAD_DATA_OPTION:
        print("Analysis result has been stored at", LDA_DIR_FE)
    print("LDA analysis finished!")
    return result
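# runLda() is defined elsewhere in the repo and not shown in this section.
# The sketch below is only an illustration of what such a helper could look
# like with gensim's LdaModel; the name runLdaSketch and the returned
# structure are assumptions, not the project's actual implementation.
def runLdaSketch(titles, tokenized_doc, contents):
    from gensim import corpora
    from gensim.models import LdaModel

    dct = corpora.Dictionary(tokenized_doc)
    corpus = [dct.doc2bow(doc) for doc in tokenized_doc]
    lda = LdaModel(corpus, num_topics=NUM_TOPICS, id2word=dct,
                   iterations=NUM_ITER, random_state=1)

    result = []
    for i, bow in enumerate(corpus):
        # Most probable topic per document, plus that topic's top terms.
        topics = sorted(lda.get_document_topics(bow), key=lambda x: x[1], reverse=True)
        top_topic = topics[0][0] if topics else None
        result.append({
            "docTitle": titles[i],
            "topic": top_topic,
            "topicTerms": lda.show_topic(top_topic) if top_topic is not None else [],
        })
    return result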
from common import prs

ndoc = 1000
prsResult = prs.readyData(ndoc, True)

import pandas as pd
# Columns follow the tuple returned by prs.readyData(ndoc, True)
data = pd.DataFrame(list(prsResult), index=["id", "content", "token", "contents"]).T

from tensorflow import keras
model = keras.models.load_model('tib_topic_model')

# Join each token list into a single whitespace-separated string
for i in range(data.shape[0]):
    data.loc[i, "token"] = " ".join(data["token"][i])

# Dummy table that maps model output indices to topic labels
topicDummy = pd.read_csv('./topicDummy.csv')
data["topic"] = None
topicDummy = topicDummy.drop(topicDummy.columns[0], axis=1)
print(topicDummy.columns)

import numpy as np
topicList = topicDummy.columns

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
MAX_NB_WORDS = 5000
MAX_SEQUENCE_LENGTH = 500
# tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters=" ")

import pickle
# Load the tokenizer that was fitted during training
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
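# The script above stops after loading the model and tokenizer. A minimal
# continuation, mirroring the prediction loop used in lstm() later in this
# section, could look like the following; it assumes the label order in
# topicList matches the model's output units.
for i, cont in enumerate(data["token"]):
    seq = tokenizer.texts_to_sequences([cont])
    padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
    pred = model.predict(padded)
    data.loc[i, "topic"] = topicList[np.argmax(pred)]

print(data["topic"].value_counts())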
import os
from pathlib import Path

curDir = os.getcwd()
curDir = Path(curDir)
homeDir = curDir.parent.parent

import sys
sys.path.append(str(homeDir))
from common import cmm
from common import esFunc
from common import prs

# rawCorpus = esFunc.esGetDocs(30)
data = prs.readyData(5)

import json
with open("tokened_history.json", 'w', -1, "utf-8") as f:
    json.dump(data[2], f, ensure_ascii=False)

# print(len(result[2]))
# tuple(doc id, title?, tokens)
# each element number = num of doc
# print(type(result[2][0]))
# print(len(result[2][0]))
# arr = []
# for tokenized_doc in result[2]:
#     for token in tokenized_doc:
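# Reading the dump back is straightforward; tokened_history.json holds the
# list of tokenized documents (data[2]) written above. This is only a small
# usage sketch.
# with open("tokened_history.json", 'r', -1, "utf-8") as f:
#     tokenized_docs = json.load(f)
# print(len(tokenized_docs), "tokenized documents loaded")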
def lstm(ndoc, db):
    # Whether to preprocess the documents again.
    # True : query the ES server and preprocess again.
    # False: load the latest saved prs result (preprocessing takes a long time).
    if True:
        from pathlib import Path
        import os
        curDir = os.getcwd()
        curDir = Path(curDir)
        homeDir = curDir.parent
        import sys
        sys.path.append(str(homeDir))
        from common import prs

        prsResult = prs.readyData(ndoc)
        import pandas as pd
        data = pd.DataFrame(list(prsResult), index=["docID", "docTitle", "token"]).T
        for i in range(data.shape[0]):
            data.loc[i, "token"] = " ".join(data["token"][i])
    else:
        import json
        with open('../latestPrsResult/latest_prs_result3000.json', 'r') as f:
            data = json.load(f)
        data_ = {
            "docID": data["idList"],
            "docTitle": data["titles"],
            "token": data["tokenized_doc"]
        }
        import pandas as pd
        df = pd.DataFrame.from_dict(data_)
        # Drop documents whose token list is empty
        df_token = df.drop(df[df["token"].map(len) < 1].index)
        df_token = df_token.reset_index(drop=True)
        for i in range(len(df_token)):
            df_token.loc[i, "token"] = " ".join(df_token["token"][i])
        data = df_token

    ndoc = len(data)
    print("number of docs : " + str(ndoc))

    # Dummy table that maps model output indices to topic labels
    topicDummy = pd.read_csv('./topicDummy.csv')
    data["topic"] = None
    topicDummy = topicDummy.drop(topicDummy.columns[0], axis=1)
    print(topicDummy.columns)

    import numpy as np
    topicList = topicDummy.columns

    # Load the trained Keras model
    from tensorflow import keras
    model = keras.models.load_model('tib_topic_model')

    from tensorflow.keras.preprocessing.text import Tokenizer
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    MAX_NB_WORDS = 5000
    MAX_SEQUENCE_LENGTH = 500
    # tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters=" ")

    import pickle
    # Load the tokenizer that was fitted during training
    with open('tokenizer.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)

    # Predict a topic for every document
    for i, cont in enumerate(data["token"]):
        seq = tokenizer.texts_to_sequences([cont])
        padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
        pred = model.predict(padded)
        # cul, eco, innt, it, pol, soc, spo
        labels = topicList
        # labels = ['pol', 'eco', 'cul', 'innt', 'spo', 'soc']
        data.loc[i, "topic"] = labels[np.argmax(pred)]

    for top in topicList:
        print(data[data["topic"] == top][["token", "topic"]].head(3), "\n")

    for topic in topicList:
        sumVal = (data["topic"] == topic).sum()
        print(topic, " count : ", sumVal)

    # Drop the raw text before storing the result
    data = data.rename(columns={"token": "words"})
    data = data.drop(columns=['words'])
    data = data.to_json(orient="records", force_ascii=False)

    from common.config import saveToMongo
    saveToMongo(data, db, "topics")
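# A minimal invocation sketch for lstm(). "tibDB" is only a placeholder for
# the db argument, which is forwarded to common.config.saveToMongo() and must
# match whatever that helper expects (database name or handle).
# if __name__ == "__main__":
#     lstm(1000, "tibDB")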
def getTfidfTable(Numdoc):
    import gensim
    from gensim import corpora
    from gensim.models import TfidfModel
    from common import prs
    from common import cmm
    import numpy as np
    from operator import itemgetter
    import json  # json / DIR_EntireTfidf are assumed to be available at module level in the original

    print(Numdoc, "documents: running bulk TF-IDF analysis")
    (docId, docTitle, tokenized_doc) = prs.readyData(Numdoc)
    print("Morphological analysis done")
    cmm.showTime()

    # Build the dictionary from the fetched documents
    dct = corpora.Dictionary(tokenized_doc)
    print("Dictionary built")
    cmm.showTime()

    # Build the bag-of-words corpus and fit the TF-IDF model
    corpus = [dct.doc2bow(line) for line in tokenized_doc]
    tfmodel = TfidfModel(corpus)
    sortEntire = []
    print("Modeling done")
    cmm.showTime()

    # Sort each document's terms by TF-IDF value
    for id, topic_list in enumerate(tfmodel[corpus]):
        topic_list = sorted(topic_list, key=itemgetter(1), reverse=True)
        sortEntire.append((id, topic_list))

    resultTF = []
    # i is the document index; section holds (wordId, tfValue) pairs
    for i, section in sortEntire:
        mainTF = []
        # print("\n===============================")
        print(i, "th document, number of terms:", len(section))
        for idx, (wordId, tfValue) in enumerate(section):
            # print("wordID :", wordId, " tfValue :", tfValue)
            mainTF.append((dct[wordId], tfValue))
            # if idx > 20:
            #     break
        # print(mainTF)
        # print("===============================\n")
        resultTF.append({
            "docID": docId[i],
            "docTitle": docTitle[i],
            "TFIDF": mainTF
        })
    cmm.showTime()

    # Save to file (DIR_EntireTfidf is a module-level output path)
    with open(DIR_EntireTfidf, 'w', -1, "utf-8") as f:
        json.dump(resultTF, f, ensure_ascii=False)
    print("Table created")
    return resultTF