Example #1
def getTfidfRaw(Numdoc):
    import json
    import gensim
    from gensim import corpora
    from gensim.models import TfidfModel
    from common import prs

    print("전처리 예시")
    #(docTitle, tokenized_doc)= prs.readyData(Numdoc) ##현재 5개 문서 호출
    (docId, docTitle, tokenized_doc) = prs.readyData(Numdoc)  #로 바뀔 예정
    #build a gensim Dictionary from the tokenized documents
    dct = corpora.Dictionary(tokenized_doc)

    #convert each document to a bag-of-words corpus
    corpus = [dct.doc2bow(line) for line in tokenized_doc]

    #fit the TF-IDF model on the corpus
    tfmodel = TfidfModel(corpus)

    #TF-IDF vector of the first document
    vector = tfmodel[corpus[0]]

    #format: [(0, 0.004840388191324659), (1, 0.01275300075896571), ...]
    #print(vector)

    sortTF = []
    from operator import itemgetter
    for i, topic_list in enumerate(tfmodel[corpus]):
        topic_list = sorted(topic_list, key=itemgetter(1), reverse=True)

        #print(i, "th document sorted by TF-IDF", topic_list)
        sortTF.append((i, topic_list))

    #numpy is used to round the TF-IDF values
    import numpy as np

    resultTF = []
    #sortTF[n][1] holds the (word id, value) pairs of document n, so its length is the document's morpheme count
    ##note: the third document is currently empty

    print(sortTF)
    for i, topic_list in sortTF:
        mainTF = []
        print(i, "th document word count : ", len(topic_list))

        for wordid, value in topic_list:
            #print(dct[wordid], "-", np.around(value, decimals=5))
            mainTF.append((dct[wordid], np.around(value, decimals=5)))
        resultTF.append((i, mainTF))

    with open(DIR_HomeGraph, 'w', -1, "utf-8") as f:
        json.dump(resultTF, f, ensure_ascii=False)

    return resultTF
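
For reference, a minimal standalone sketch of the same gensim pipeline on a tiny made-up corpus; the token lists are illustrative only, but the output has the same [(word, weight), ...] shape that resultTF collects above.

from gensim import corpora
from gensim.models import TfidfModel

# toy tokenized documents (made-up data, for illustration only)
toy_docs = [["apple", "banana", "apple"],
            ["banana", "cherry"],
            ["apple", "cherry", "cherry"]]
toy_dct = corpora.Dictionary(toy_docs)
toy_corpus = [toy_dct.doc2bow(doc) for doc in toy_docs]
toy_model = TfidfModel(toy_corpus)

# each document becomes a sparse list of (word id, TF-IDF weight) pairs
for i, vec in enumerate(toy_model[toy_corpus]):
    print(i, [(toy_dct[wid], round(w, 5)) for wid, w in vec])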
Example #2
def LDA(ndoc, nit=NUM_ITER, ntp=NUM_TOPICS):

    # update the module-level defaults when new parameters are passed in
    global NUM_ITER
    global NUM_TOPICS

    if NUM_ITER != nit:
        NUM_ITER = nit
    if NUM_TOPICS != ntp:
        NUM_TOPICS = ntp

    print("LDA Algo 시작!")

    print(
        "##########Pahse 0 : LDA option:##########",
        "\nDOWNLOAD OPTION : ",
        str(DOWNLOAD_DATA_OPTION),
        #  "\nBACKEND CONNECTION OPTION : ", str(BACKEND_CONCT),
        #  "\nRANDOM ORDER OPTION : ", str(RANDOM_MODE)
    )

    # Phase 1 : READY DATA
    print("\n\n##########Phase 1 : READY DATA##########")
    (doc_id, titles, tokenized_doc, contents) = prs.readyData(ndoc, True)

    # run the LDA algorithm
    print("\n\n##########Phase 2 : LDA Algo##########")
    result = runLda(titles, tokenized_doc, contents)

    if DOWNLOAD_DATA_OPTION:
        with open(LDA_DIR_FE, 'w', -1, "utf-8") as f:
            json.dump(result, f, ensure_ascii=False)

    showTime()

    if DOWNLOAD_DATA_OPTION:
        print("Analysis Result has been stored at ", LDA_DIR_FE)
    print("LDA Analysis Fin!")
    return result
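
A hedged usage sketch for the function above. It assumes the module-level names it relies on (NUM_ITER, NUM_TOPICS, DOWNLOAD_DATA_OPTION, LDA_DIR_FE, prs, runLda, showTime, json) are already defined in the module; the document and topic counts are illustrative values only.

# analyze 100 documents with 20 topics and 500 iterations (illustrative values)
result = LDA(100, nit=500, ntp=20)
print(type(result))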
Example #3
from common import prs
ndoc = 1000
prsResult = prs.readyData(ndoc, True)
import pandas as pd
data = pd.DataFrame(list(prsResult),
                    index=["id", "content", "token", "contents"]).T
from tensorflow import keras
model = keras.models.load_model('tib_topic_model')

for i in range(data.shape[0]):
    data.loc[i, "token"] = " ".join(data["token"][i])

topicDummy = pd.read_csv('./topicDummy.csv')
data["topic"] = None
topicDummy = topicDummy.drop(topicDummy.columns[0], axis=1)
print(topicDummy.columns)

import numpy as np
topicList = topicDummy.columns

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_NB_WORDS = 5000
MAX_SEQUENCE_LENGTH = 500
#tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters = " ")
import pickle
# loading
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
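
The script stops after loading the tokenizer. Below is a sketch of the prediction step that could follow, mirroring the loop used in the lstm() function later in this collection; all names come from the code above.

# classify each document: tokenize, pad, predict, and store the top topic label
for i, cont in enumerate(data["token"]):
    seq = tokenizer.texts_to_sequences([cont])
    padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
    pred = model.predict(padded)
    data.loc[i, "topic"] = topicList[np.argmax(pred)]

print(data["topic"].value_counts())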
Example #4
import os
from pathlib import Path
curDir = os.getcwd()
curDir = Path(curDir)
homeDir = curDir.parent.parent

import sys
sys.path.append(str(homeDir))

from common import cmm
from common import esFunc
from common import prs

# rawCorpus = esFunc.esGetDocs(30)
data = prs.readyData(5)

import json
with open("tokened_history.json", 'w', -1, "utf-8") as f:
        json.dump(data[2], f, ensure_ascii=False)
# print(len(result[2]))
# tuple(doc id,title?,tokens)
# each element number = num of doc
# print(type(result[2][0]))
# print(len(result[2][0]))

# arr = []

# for tokenized_doc in result[2]:
    # for token in tokenized_doc:
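
One way the commented-out sketch above could be completed, assuming data[2] holds the per-document token lists as the comments suggest (a sketch only; the original leaves this loop unfinished):

arr = []
for tokenized_doc in data[2]:
    for token in tokenized_doc:
        arr.append(token)
print("total tokens:", len(arr))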

Example #5
def lstm(ndoc, db):
    # toggle: preprocess the documents again or load a cached result
    # True: query the ES server and preprocess again; False: load the latest prs result (preprocessing takes a long time)
    if True:
        from pathlib import Path
        import os
        curDir = os.getcwd()
        curDir = Path(curDir)
        homeDir = curDir.parent

        import sys
        sys.path.append(str(homeDir))
        from common import prs
        prsResult = prs.readyData(ndoc)
        import pandas as pd
        data = pd.DataFrame(list(prsResult),
                            index=["docID", "docTitle", "token"]).T
        for i in range(data.shape[0]):
            data.loc[i, "token"] = " ".join(data["token"][i])

    else:
        import json
        with open('../latestPrsResult/latest_prs_result3000.json', 'r') as f:
            data = json.load(f)

        data_ = {
            "docID": data["idList"],
            "docTitle": data["titles"],
            "token": data["tokenized_doc"]
        }

        import pandas as pd
        df = pd.DataFrame.from_dict(data_)

        df_token = df.drop(df[df["token"].map(len) < 1].index)
        df_token = df_token.reset_index(drop=True)

        for i in range(len(df_token)):
            df_token.loc[i, "token"] = " ".join(df_token["token"][i])
        data = df_token
        ndoc = len(data)
        print("number of docs : " + str(ndoc))

    # dummy table mapping the classifier output to topic labels
    topicDummy = pd.read_csv('./topicDummy.csv')
    data["topic"] = None
    topicDummy = topicDummy.drop(topicDummy.columns[0], axis=1)
    print(topicDummy.columns)

    import numpy as np
    topicList = topicDummy.columns

    # load the trained Keras model
    from tensorflow import keras
    model = keras.models.load_model('tib_topic_model')

    from tensorflow.keras.preprocessing.text import Tokenizer
    from tensorflow.keras.preprocessing.sequence import pad_sequences

    MAX_NB_WORDS = 5000
    MAX_SEQUENCE_LENGTH = 500
    #tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters = " ")
    import pickle
    # loading
    with open('tokenizer.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)

    for i, cont in enumerate(data["token"]):
        seq = tokenizer.texts_to_sequences([cont])
        padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
        pred = model.predict(padded)
        #   cul,eco,innt,it,pol,soc,spo
        labels = topicList
        # labels = ['pol', 'eco', 'cul', 'innt', 'spo', 'soc']
        data.loc[i, "topic"] = labels[np.argmax(pred)]

    for top in topicList:
        print(data[data["topic"] == top][["token", "topic"]].head(3), "\n")

    for topic in topicList:
        sumVal = (data["topic"] == topic).sum()
        print(topic, " count : ", sumVal)

    data = data.rename(columns={"token": "words"})

    data = data.drop(columns=['words'])

    data = data.to_json(orient="records", force_ascii=False)

    from common.config import saveToMongo
    saveToMongo(data, db, "topics")
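
A hedged usage sketch: the document count is illustrative, and the second argument is a placeholder for whatever database identifier common.config.saveToMongo expects (the example does not show its type).

# hypothetical call: classify 1000 documents and store the topics via saveToMongo
lstm(1000, "someDatabase")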
Example #6
def getTfidfTable(Numdoc):
    import json
    import gensim
    from gensim import corpora
    from gensim.models import TfidfModel
    from common import prs
    from common import cmm
    import numpy as np
    from operator import itemgetter

    print(Numdoc, "documents: bulk TF/IDF analysis")
    (docId, docTitle, tokenized_doc) = prs.readyData(Numdoc)
    print("Morphological analysis done")
    cmm.showTime()

    #build a gensim Dictionary from the tokenized documents
    dct = corpora.Dictionary(tokenized_doc)
    print("Dictionary built")
    cmm.showTime()

    #build the bag-of-words corpus and fit the TF-IDF model
    corpus = [dct.doc2bow(line) for line in tokenized_doc]
    tfmodel = TfidfModel(corpus)
    sortEntire = []
    print("Modeling done")
    cmm.showTime()

    #sort each document's terms by TF-IDF value
    for i, topic_list in enumerate(tfmodel[corpus]):
        topic_list = sorted(topic_list, key=itemgetter(1), reverse=True)
        sortEntire.append((i, topic_list))

    resultTF = []
    #i is the document index; topic_list holds the (word id, TF-IDF value) pairs
    for i, topic_list in sortEntire:
        mainTF = []

        #print("\nㄱ===============================ㄴ")
        print(i, "th document word count : ", len(topic_list))

        for idx, (wordId, tfValue) in enumerate(topic_list):
            #print("wordID : ", wordId, " tfValue : ", tfValue)
            mainTF.append((dct[wordId], tfValue))
            #if idx > 20:
            #    break
        #print(mainTF)
        #print("ㄷ===============================ㄹ\n")
        resultTF.append({
            "docID": docId[i],
            "docTitle": docTitle[i],
            "TFIDF": mainTF
        })

    cmm.showTime()
    #save the result to a file
    with open(DIR_EntireTfidf, 'w', -1, "utf-8") as f:
        json.dump(resultTF, f, ensure_ascii=False)

    print("테이블 생성 완료")

    return resultTF
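
A hedged usage sketch for getTfidfTable(); it assumes DIR_EntireTfidf and the common.prs / common.cmm helpers are available at module level, and the document count is illustrative.

table = getTfidfTable(100)          # illustrative document count
first = table[0]
print(first["docID"], first["docTitle"])
print(first["TFIDF"][:10])          # ten highest-weighted (word, value) pairs of the first document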