import pickle


def read_corpus_data(filename):
    """Read a tab-separated corpus file and return its rows without the header.

    Args:
        filename: Path of the corpus file to read.

    Returns:
        A list of rows; each row is the list of tab-separated fields of one
        line. The first line of the file is dropped as a header.
    """
    with open(filename, 'r') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
    return data[1:]


# NOTE(review): `Preprocess` and `preprocessing` are not imported in the
# visible part of this script; presumably they are imported elsewhere — confirm.
corpus_data = read_corpus_data('./corpus.txt')

# Collect the first element of every item p.pos() yields for the second
# column of each corpus row (presumably the morpheme text — confirm against
# Preprocess.pos).
p = Preprocess()
word_list = []  # previously named `dict`, which shadowed the builtin type
for c in corpus_data:
    pos = p.pos(c[1])
    word_list.extend(k[0] for k in pos)

# Build the word2index mapping used for the dictionary.
# OOV is used for the first index of the dictionary.
tokenizer = preprocessing.text.Tokenizer(oov_token='OOV')
tokenizer.fit_on_texts(word_list)
word_index = tokenizer.word_index

# Create the dictionary file. The with-block guarantees the handle is closed
# even when pickling fails (the original never closed it).
with open("chatbot_dict.bin", "wb") as f:
    try:
        pickle.dump(word_index, f)
    except Exception as e:
        # Best-effort reporting, as in the original script.
        print(e)
from utils.Preprocess import Preprocess
from tensorflow.keras import preprocessing

# Locations of the pickled word index and the user dictionary.
WORD_INDEX_PATH = '../train_tools/dict/chatbot_dict.bin'
USER_DIC_PATH = '../utils/user_dic.tsv'

# Sample sentence to analyse; an alternative sample is kept disabled.
# sent = "내일 오전 10시에 짬뽕 주문하고 싶어ㅋㅋ"
sent = "내일 오전 10시에 탕수육 주문하고 싶어"

p = Preprocess(word2index_dic=WORD_INDEX_PATH, userdic=USER_DIC_PATH)

# Run the POS step, then request keywords with their tags kept
# (without_tag=False) and print the result.
pos = p.pos(sent)
keywords = p.get_keywords(pos, without_tag=False)
print(keywords)

# Disabled follow-up: convert the keywords to a word-index sequence and pad it.
# w2i = p.get_wordidx_sequence(keywords)
# sequences = [w2i]
#
# MAX_SEQ_LEN = 15  # embedding vector size
# padded_seqs = preprocessing.sequence.pad_sequences(sequences, maxlen=MAX_SEQ_LEN, padding='post')
#
# print(keywords)
# print(sequences)
# print(padded_seqs)
from tensorflow.keras.utils import plot_model

from utils.Preprocess import Preprocess
from config.GlobalParams import MAX_SEQ_LEN  # word sequence vector size

# NOTE(review): `pd` and `preprocessing` are not imported in the visible part
# of this script; presumably they are imported elsewhere — confirm.

# Read the training data.
train_file = "drive/MyDrive/deep-chatbot/models/intent/total_train_data.csv"
data = pd.read_csv(train_file, delimiter=',')
queries = data['query'].tolist()
intents = data['intent'].tolist()

p = Preprocess(
    word2index_dic='drive/MyDrive/deep-chatbot/train_tools/dict/chatbot_dict.bin'
)

# Generate the word sequences: each query is coerced to str, POS-processed,
# reduced to tag-less keywords and mapped to word indices.
sequences = [
    p.get_wordidx_sequence(
        p.get_keywords(p.pos(str(sentence)), without_tag=True)
    )
    for sentence in queries
]

# Word index sequence vectors (2): pad every sequence at the end up to
# MAX_SEQ_LEN entries.
padded_seqs = preprocessing.sequence.pad_sequences(
    sequences, maxlen=MAX_SEQ_LEN, padding='post'
)

# Values recorded by the original author: (105658, 15) and 105658.
print(padded_seqs.shape)
print(len(intents))
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, Conv1D, GlobalMaxPool1D, concatenate

# NOTE(review): `pd` and `preprocessing` are not imported in the visible part
# of this script; presumably they are imported elsewhere — confirm.

# Read the training data.
train_file = "total_train_data.csv"
data = pd.read_csv(train_file, delimiter=',', encoding='cp949')
queries = data['query'].tolist()
intents = data['intent'].tolist()

from utils.Preprocess import Preprocess
p = Preprocess(word2index_dic='../../train_tools/dict/chatbot_dict.bin',
               userdic='../../utils/user_dic.tsv')

# Generate the word sequences.
sequences = []
for sentence in queries:
    # pandas yields non-string values (e.g. NaN for an empty cell, numbers for
    # numeric-looking queries); coerce to str so the POS step always receives
    # text, matching the sibling Colab training script.
    pos = p.pos(str(sentence))
    keywords = p.get_keywords(pos, without_tag=True)
    seq = p.get_wordidx_sequence(keywords)
    sequences.append(seq)

# Word index sequence vectors (2)
# Word sequence vector size
# NOTE(review): sibling scripts take this length from
# config.GlobalParams.MAX_SEQ_LEN; 15 is kept hard-coded here — confirm they agree.
padded_seqs = preprocessing.sequence.pad_sequences(sequences, maxlen=15, padding='post')

# Values recorded by the original author: (105658, 15) and 105658.
print(padded_seqs.shape)
print(len(intents))

# Create the training, validation and test datasets (3)
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import preprocessing

from utils.Preprocess import Preprocess
from config.GlobalParams import MAX_SEQ_LEN  # word sequence vector size

# Maps the predicted class index to the intent label that gets printed.
intent_labels = {0: "인사", 1: "욕설", 2: "주문", 3: "예약", 4: "기타"}

# Load the intent classification model.
model = load_model('intent_model.h5')

# Query to classify; an alternative sample is kept disabled.
query = "오늘 탕수육 주문 가능한가요?"
# query = "안녕하세요?"

p = Preprocess(
    word2index_dic='../../train_tools/dict/chatbot_dict.bin',
    userdic='../../utils/user_dic.tsv',
)

# Query -> POS result -> tag-less keywords -> word-index sequence.
pos = p.pos(query)
keywords = p.get_keywords(pos, without_tag=True)
seq = p.get_wordidx_sequence(keywords)
sequences = [seq]

# Pad the single sequence at the end up to MAX_SEQ_LEN entries.
padded_seqs = preprocessing.sequence.pad_sequences(
    sequences, maxlen=MAX_SEQ_LEN, padding='post'
)

# Score the padded sequence and take the arg-max along axis 1 as the class.
predict = model.predict(padded_seqs)
predict_class = tf.math.argmax(predict, axis=1)
class_ids = predict_class.numpy()

print(query)
print("의도 예측 점수 : ", predict)
print("의도 예측 클래스 : ", class_ids)
print("의도 : ", intent_labels[class_ids[0]])
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import preprocessing
import numpy as np
import sys

# The project root must be on sys.path before the project import below.
sys.path.append('/content/drive/MyDrive/deep-chatbot/')
from utils.Preprocess import Preprocess

p = Preprocess(
    word2index_dic='drive/MyDrive/deep-chatbot/train_tools/dict/chatbot_dict.bin'
)

# Sentence -> POS result -> tag-less keywords -> word-index sequence.
new_sentence = '데이터베이스'
pos = p.pos(new_sentence)
keywords = p.get_keywords(pos, without_tag=True)
new_seq = p.get_wordidx_sequence(keywords)

# Pad the single sequence at the end with zeros up to max_len entries.
max_len = 40
new_padded_seqs = preprocessing.sequence.pad_sequences(
    [new_seq], maxlen=max_len, padding="post", value=0
)

print("새로운 유형의 시퀀스 : ", new_seq)
print("새로운 유형의 시퀀스 : ", new_padded_seqs)

# NER prediction
model = load_model('drive/MyDrive/deep-chatbot/models/ner/ner_model.h5')
batch = np.array([new_padded_seqs[0]])
# Extract the predicted NER index values: arg-max over the last axis of the
# model output. Note that `p` is rebound here from the Preprocess instance to
# the prediction result, as in the original script.
p = np.argmax(model.predict(batch), axis=-1)
from utils.Preprocess import Preprocess

# Path of the user dictionary.
# NOTE(review): other scripts in this project reference 'user_dic.tsv';
# confirm that the '.txt' file used here is intended.
USER_DIC_PATH = '../utils/user_dic.txt'

sent = "김포시에 있는 고려병원 위치 알려줘"

p = Preprocess(userdic=USER_DIC_PATH)

# Sentence split into morpheme units; POS tags to exclude have not been
# filtered out yet at this point.
pos = p.pos(sent)

# Disabled debugging output: individual entries of `pos`, the whole list, and
# the keywords with their tags kept.
# print(pos[0])
# print(pos[1])
# print(pos[2])
# print(pos[3])
# print(pos[4])
# print(pos[5])
# print(pos[6])
# print(pos[7])
# print(pos[8])
# print(pos[9])
# print(pos)
# ret = p.get_keywords(pos, without_tag=False)
# print(pos)
# print(ret)

# Extract only the words, without tags.
ret = p.get_keywords(pos, without_tag=True)
print(ret)