Example #1
import pickle

from tensorflow.keras import preprocessing
from utils.Preprocess import Preprocess


def read_corpus_data(filename):
    # Read a tab-separated corpus file and drop the header row
    with open(filename, 'r') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
        data = data[1:]  # skip the header
    return data


corpus_data = read_corpus_data('./corpus.txt')

p = Preprocess()
words = []  # collect every token in the corpus (renamed from `dict` to avoid shadowing the builtin)
for c in corpus_data:
    pos = p.pos(c[1])
    for k in pos:
        words.append(k[0])

# Build the word2index mapping used as the dictionary
# The first assigned index of the dictionary is reserved for OOV (out-of-vocabulary) words
tokenizer = preprocessing.text.Tokenizer(oov_token='OOV')
tokenizer.fit_on_texts(words)
word_index = tokenizer.word_index

# Save the dictionary to a file
with open("chatbot_dict.bin", "wb") as f:
    try:
        pickle.dump(word_index, f)
    except Exception as e:
        print(e)
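
The saved file is what the later examples pass to Preprocess(word2index_dic=...). A minimal sketch of reading it back, assuming only that the file holds the pickled word_index dict:

import pickle

# Load the word2index dictionary saved above
with open("chatbot_dict.bin", "rb") as f:
    word_index = pickle.load(f)

print(word_index.get('OOV'))  # 1 -- the Keras Tokenizer gives the OOV token the first index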
Example #2
from utils.Preprocess import Preprocess
from tensorflow.keras import preprocessing

#sent = "내일 오전 10시에 짬뽕 주문하고 싶어ㅋㅋ"  # "I want to order jjamppong at 10 a.m. tomorrow lol"
sent = "내일 오전 10시에 탕수육 주문하고 싶어"  # "I want to order sweet-and-sour pork at 10 a.m. tomorrow"
p = Preprocess(word2index_dic='../train_tools/dict/chatbot_dict.bin',
               userdic='../utils/user_dic.tsv')

pos = p.pos(sent)
keywords = p.get_keywords(pos, without_tag=False)

print(keywords)

# w2i = p.get_wordidx_sequence(keywords)
# sequences = [w2i]
#
# MAX_SEQ_LEN = 15    # maximum sequence length for padding
# padded_seqs = preprocessing.sequence.pad_sequences(sequences, maxlen=MAX_SEQ_LEN, padding='post')
#
# print(keywords)
# print(sequences)
# print(padded_seqs)
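
All of these examples depend on the same utils.Preprocess helper, whose source is not shown here. The sketch below reconstructs its interface from the calls used above and below; the Komoran tagger and the exact stop-tag list are assumptions, not the original code:

import pickle
from konlpy.tag import Komoran

class Preprocess:
    def __init__(self, word2index_dic=None, userdic=None):
        # word2index dictionary pickled by Example #1 (optional)
        self.word_index = None
        if word2index_dic:
            with open(word2index_dic, 'rb') as f:
                self.word_index = pickle.load(f)
        # Morphological analyzer with an optional user dictionary
        self.komoran = Komoran(userdic=userdic)
        # POS tags to drop when extracting keywords
        # (assumed: particles, endings, affixes, punctuation)
        self.exclusion_tags = [
            'JKS', 'JKC', 'JKG', 'JKO', 'JKB', 'JKV', 'JKQ', 'JX', 'JC',
            'SF', 'SP', 'SS', 'SE', 'SO',
            'EP', 'EF', 'EC', 'ETN', 'ETM',
            'XSN', 'XSV', 'XSA',
        ]

    def pos(self, sentence):
        # (token, POS-tag) pairs for the sentence
        return self.komoran.pos(sentence)

    def get_keywords(self, pos, without_tag=False):
        # Keep tokens whose tag is not excluded; optionally strip the tags
        return [word if without_tag else (word, tag)
                for word, tag in pos if tag not in self.exclusion_tags]

    def get_wordidx_sequence(self, keywords):
        # Map words to dictionary indices; unknown words fall back to OOV
        if self.word_index is None:
            return []
        return [self.word_index.get(word, self.word_index['OOV'])
                for word in keywords]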
Example #3
import pandas as pd
from tensorflow.keras import preprocessing
from tensorflow.keras.utils import plot_model

# Load the training data
train_file = "drive/MyDrive/deep-chatbot/models/intent/total_train_data.csv"
data = pd.read_csv(train_file, delimiter=',')
queries = data['query'].tolist()
intents = data['intent'].tolist()

from utils.Preprocess import Preprocess
p = Preprocess(word2index_dic=
               'drive/MyDrive/deep-chatbot/train_tools/dict/chatbot_dict.bin')

# Build word-index sequences
sequences = []
for sentence in queries:
    pos = p.pos(str(sentence))
    keywords = p.get_keywords(pos, without_tag=True)
    seq = p.get_wordidx_sequence(keywords)
    sequences.append(seq)

# Word-index sequence vectors, padded to a fixed size
from config.GlobalParams import MAX_SEQ_LEN
padded_seqs = preprocessing.sequence.pad_sequences(sequences,
                                                   maxlen=MAX_SEQ_LEN,
                                                   padding='post')

# (105658, 15)
print(padded_seqs.shape)
print(len(intents))  #105658
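
For reference, pad_sequences with padding='post' right-pads (or truncates) every sequence to maxlen, which is what produces the fixed (105658, 15) shape above; a minimal standalone check:

from tensorflow.keras import preprocessing

demo = preprocessing.sequence.pad_sequences([[5, 8, 2]], maxlen=6, padding='post')
print(demo)  # [[5 8 2 0 0 0]]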
Example #4
import pandas as pd
from tensorflow.keras import preprocessing
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, Conv1D, GlobalMaxPool1D, concatenate

# Load the training data
train_file = "total_train_data.csv"
data = pd.read_csv(train_file, delimiter=',', encoding='cp949')
queries = data['query'].tolist()
intents = data['intent'].tolist()

from utils.Preprocess import Preprocess
p = Preprocess(word2index_dic='../../train_tools/dict/chatbot_dict.bin',
               userdic='../../utils/user_dic.tsv')

# Build word-index sequences
sequences = []
for sentence in queries:
    pos = p.pos(sentence)
    keywords = p.get_keywords(pos, without_tag=True)
    seq = p.get_wordidx_sequence(keywords)
    sequences.append(seq)

# Word-index sequence vectors, padded to a fixed size
padded_seqs = preprocessing.sequence.pad_sequences(sequences,
                                                   maxlen=15,
                                                   padding='post')

# (105658, 15)
print(padded_seqs.shape)
print(len(intents))  #105658

# Build the train / validation / test datasets (script truncated here; see the sketch below)
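
The script is cut off at the comment above. A sketch of what plausibly follows, given the Conv1D/GlobalMaxPool1D/concatenate imports: a shuffled 70/20/10 split and a text-CNN intent classifier. The split ratios, batch size, filter sizes, and embedding size are assumptions, and p.word_index assumes Preprocess exposes the loaded dictionary as in the earlier sketch:

import tensorflow as tf
from tensorflow.keras.models import Model

# Shuffle, then split 70% train / 20% validation / 10% test (assumed ratios)
ds = tf.data.Dataset.from_tensor_slices((padded_seqs, intents))
ds = ds.shuffle(len(queries))
train_size = int(len(padded_seqs) * 0.7)
val_size = int(len(padded_seqs) * 0.2)
train_ds = ds.take(train_size).batch(20)
val_ds = ds.skip(train_size).take(val_size).batch(20)
test_ds = ds.skip(train_size + val_size).batch(20)

# Text-CNN: three parallel convolution branches over the embedded sequence
VOCAB_SIZE = len(p.word_index) + 1  # +1 for the padding index 0
input_layer = Input(shape=(15,))
emb = Embedding(VOCAB_SIZE, 128, input_length=15)(input_layer)  # 128 = assumed embedding size
emb = Dropout(0.5)(emb)

pools = []
for kernel_size in (3, 4, 5):  # assumed filter sizes
    conv = Conv1D(128, kernel_size, padding='valid', activation='relu')(emb)
    pools.append(GlobalMaxPool1D()(conv))

merged = concatenate(pools)
hidden = Dropout(0.5)(Dense(128, activation='relu')(merged))
predictions = Dense(5, activation='softmax')(hidden)  # 5 intent classes, as in Example #5

model = Model(inputs=input_layer, outputs=predictions)
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',  # assumes integer intent labels
              metrics=['accuracy'])
model.fit(train_ds, validation_data=val_ds, epochs=5)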
Example #5
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import preprocessing

# Intent index -> label (greeting, profanity, order, reservation, other)
intent_labels = {0: "인사", 1: "욕설", 2: "주문", 3: "예약", 4: "기타"}

# Load the trained intent classification model
model = load_model('intent_model.h5')

query = "오늘 탕수육 주문 가능한가요?"  # "Can I order sweet-and-sour pork today?"
#query = "안녕하세요?"  # "Hello?"
from utils.Preprocess import Preprocess
p = Preprocess(word2index_dic='../../train_tools/dict/chatbot_dict.bin',
               userdic='../../utils/user_dic.tsv')
pos = p.pos(query)
keywords = p.get_keywords(pos, without_tag=True)
seq = p.get_wordidx_sequence(keywords)
sequences = [seq]

# Pad to the fixed sequence length
from config.GlobalParams import MAX_SEQ_LEN
padded_seqs = preprocessing.sequence.pad_sequences(sequences, maxlen=MAX_SEQ_LEN, padding='post')

predict = model.predict(padded_seqs)
predict_class = tf.math.argmax(predict, axis=1)
print(query)
print("Intent prediction scores : ", predict)
print("Predicted intent class : ", predict_class.numpy())
print("Intent : ", intent_labels[predict_class.numpy()[0]])

Example #6
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import preprocessing
import numpy as np

import sys

sys.path.append('/content/drive/MyDrive/deep-chatbot/')

from utils.Preprocess import Preprocess

p = Preprocess(word2index_dic=
               'drive/MyDrive/deep-chatbot/train_tools/dict/chatbot_dict.bin')

new_sentence = '데이터베이스'  # "database"
pos = p.pos(new_sentence)
keywords = p.get_keywords(pos, without_tag=True)
new_seq = p.get_wordidx_sequence(keywords)

max_len = 40
new_padded_seqs = preprocessing.sequence.pad_sequences([new_seq],
                                                       padding="post",
                                                       value=0,
                                                       maxlen=max_len)
print("새로운 유형의 시퀀스 : ", new_seq)
print("새로운 유형의 시퀀스 : ", new_padded_seqs)

# NER prediction (renamed from `p`, which would have shadowed the Preprocess object above)
model = load_model('drive/MyDrive/deep-chatbot/models/ner/ner_model.h5')
predictions = model.predict(np.array([new_padded_seqs[0]]))
predicted_tags = np.argmax(predictions, axis=-1)  # predicted NER index per token
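
The script stops at the raw index predictions. A short sketch of turning them back into readable tags; the index_to_ner table is hypothetical and must mirror the encoding used when ner_model.h5 was trained:

# Hypothetical index -> NER tag table (must match the training-time encoding)
index_to_ner = {0: 'PAD', 1: 'O', 2: 'B_DT', 3: 'B_FOOD', 4: 'I',
                5: 'B_OG', 6: 'B_PS', 7: 'B_LC', 8: 'NNP', 9: 'B_TI'}

tags = [index_to_ner[i] for i in predicted_tags[0]]
print(list(zip(keywords, tags)))  # zip stops at the real tokens, before the padding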
Example #7
from utils.Preprocess import Preprocess

sent = "김포시에 있는 고려병원 위치 알려줘"  # "Tell me where the Goryeo Hospital in Gimpo is"

p = Preprocess(userdic='../utils/user_dic.txt')  # user dictionary

pos = p.pos(sent)  # morpheme-level (token, POS-tag) pairs; stop-word tags not excluded yet
# print(pos)  # inspect the raw (token, tag) pairs, e.g. pos[0] ... pos[9]
# ret = p.get_keywords(pos, without_tag=False)  # keywords with their tags kept
# print(ret)

# Extract the words only, without their tags
ret = p.get_keywords(pos, without_tag=True)
print(ret)