def get_word2vec_matrix(path):
    word2vecExt_obj = Word2vecExt()
    word2vecExt_obj.load(path)
    wv = word2vecExt_obj.model.wv
    vocab = Vocabulary()
    vocab.fit(wv.index2word)
    # Prepend two zero rows so the first two vocabulary indices (presumably the
    # reserved padding/unknown tokens) map to zero vectors.
    matrix = np.concatenate((np.zeros((2, wv.vectors.shape[1])), wv.vectors), axis=0)
    return vocab, matrix
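# A minimal usage sketch for get_word2vec_matrix, assuming a PyTorch consumer and a
# trained word2vec model on disk. The path below mirrors the commented call in a
# later snippet but is hypothetical here, and the meaning of the two reserved rows
# (padding/unknown) is an inference, not confirmed by the original code.
import torch
import torch.nn as nn

vocab, matrix = get_word2vec_matrix("corpus/word2vec/wiki_default")  # hypothetical path
embedding = nn.Embedding.from_pretrained(
    torch.tensor(matrix, dtype=torch.float32),
    freeze=False,    # keep the pretrained vectors trainable
    padding_idx=0,   # assumes index 0 is the padding token
)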
def load(path, MAX_LEN):
    s_vocab = Vocabulary(init_vocablary=["<sos>", "<eos>"])
    t_vocab = Vocabulary(init_vocablary=["<sos>", "<eos>"])
    with open(path, 'r') as f:
        data = f.readlines()
    size = int(len(data) / 3)
    source_seq = []
    target_seq = []
    for i in range(size):
        # Records span three lines; drop the two-character prefix and all spaces.
        s_i = re.sub(" ", "", data[i * 3 + 1].strip("\n").lower()[2:])
        t_i = re.sub(" ", "", data[i * 3 + 2].strip("\n").lower()[2:])
        # Skip empty pairs and pairs with an over-length side.
        if len(s_i) == 0 or len(t_i) == 0 or len(t_i) > MAX_LEN or len(s_i) > MAX_LEN:
            continue
        source_seq.append(s_i)
        target_seq.append(t_i)
    source_seq = cut(source_seq, 4)
    target_seq = cut(target_seq, 4)
    for i in range(len(target_seq)):
        target_seq[i].append("<eos>")
    s_vocab.fit(source_seq)
    t_vocab.fit(target_seq)
    return source_seq, s_vocab, target_seq, t_vocab
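# An illustration of the three-line record layout that load() appears to expect: each
# record spans three lines, and the source/target lines carry a two-character prefix
# that the [2:] slice strips. This layout is inferred from the indexing logic above,
# not documented in the original; the sample lines and path below are hypothetical.
#
#   <header line>        -> data[i * 3], ignored
#   Q hello there        -> data[i * 3 + 1], "Q " stripped by [2:]
#   A hi, how are you    -> data[i * 3 + 2], "A " stripped by [2:]
#
# source_seq, s_vocab, target_seq, t_vocab = load("path/to/dialog.txt", MAX_LEN=30)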
# p = ROOT_PATH.parent / "corpus" / "intent" / "fastText"
# x, y = read_fasttext_file(str(p / "amazon.txt"))
# train_x, train_y = x[:7000], y[:7000]
# test_x, test_y = x[7000:], y[7000:]
import pandas as pd

p = ROOT_PATH.parent / "corpus" / "intent"
c1 = pd.read_excel(str(p / "intent_1.xlsx"))[["text", "intent"]]
c2 = pd.read_excel(str(p / "intent_2.xlsx"))
corpus_train = pd.concat([c1, c2]).reset_index(drop=True)
corpus_test = pd.read_excel(str(p / "intent_valid.xlsx"))
train_x = cut(corpus_train.text.tolist())
train_y = corpus_train.intent.tolist()
test_x = cut(corpus_test.text.tolist())
test_y = corpus_test.intent.tolist()
vocab = Vocabulary()
vocab.fit(train_x)
label = IntentLabel()
label.fit(train_y)
train_x = np.array(vocab.transform(train_x, max_length=10))
test_x = np.array(vocab.transform(test_x, max_length=10))
train_y = np.array(label.transform(train_y))
test_y = np.array(label.transform(test_y))
# NOTE: `p` is reused here; it now holds hyperparameters rather than the corpus path.
p = {
    "vocab_size": len(vocab),
    "embed_dim": 40,
    "class_num": len(label),
    "lr": 0.01,
    # "dropout": 0.5,
}
if __name__ == "__main__":
    import numpy as np
    from chatbot.utils.path import ROOT_PATH, MODEL_PATH
    from chatbot.utils.data import read_fasttext_file
    from chatbot.cparse.vocabulary import Vocabulary
    from chatbot.cparse.label import IntentLabel

    p = ROOT_PATH.parent / "corpus" / "intent" / "fastText"
    x, y = read_fasttext_file(str(p / "amazon.txt"))
    train_x, train_y = x[:7000], y[:7000]
    test_x, test_y = x[7000:], y[7000:]
    # train_x, train_y = read_fasttext_file(str(p / "demo.train.txt"))
    # test_x, test_y = read_fasttext_file(str(p / "demo.train.txt"))
    vocab = Vocabulary()
    vocab.fit(train_x)
    label = IntentLabel()
    label.fit(train_y)
    train_x = np.array(vocab.transform(train_x, max_length=50))
    test_x = np.array(vocab.transform(test_x, max_length=50))
    train_y = np.array(label.transform(train_y))
    test_y = np.array(label.transform(test_y))
    p = {
        "vocab_size": len(vocab),
        "embed_dim": 60,
        "class_num": len(label),
        "lr": 0.01,
        # "dropout": 0.5,
    }
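# The `p` dict above is presumably consumed by the repo's FastText model, whose
# definition is not shown in this snippet. As a rough sketch of what a fastText-style
# classifier with these hyperparameters looks like (averaged embeddings followed by a
# linear layer), assuming padded id matrices like train_x as input:
import torch
import torch.nn as nn
import torch.nn.functional as F

class FastTextSketch(nn.Module):
    def __init__(self, vocab_size, embed_dim, class_num, **kwargs):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.fc = nn.Linear(embed_dim, class_num)

    def forward(self, x):
        # x: (batch, seq_len) token ids; average the embeddings over the sequence.
        return F.log_softmax(self.fc(self.embed(x).mean(dim=1)), dim=1)

# model = FastTextSketch(**p)  # extra keys such as "lr" are absorbed by **kwargs
# optimizer = torch.optim.Adam(model.parameters(), lr=p["lr"])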
p = ROOT_PATH.parent / "corpus" / "intent"
c1 = pd.read_excel(str(p / "intent_1.xlsx"))[["text", "intent"]]
c2 = pd.read_excel(str(p / "intent_2.xlsx"))
corpus_train = pd.concat([c1, c2]).reset_index(drop=True)
corpus_test = pd.read_excel(str(p / "intent_valid.xlsx"))
train_x = cut(corpus_train.text.tolist())
train_y = corpus_train.intent.tolist()
test_x = cut(corpus_test.text.tolist())
test_y = corpus_test.intent.tolist()

from chatbot.preprocessing.word2vec import get_word2vec_matrix
# vocab, w2v = get_word2vec_matrix(
#     str(ROOT_PATH.parent / "corpus" / "word2vec" / "wiki_default")
# )
vocab = Vocabulary()
vocab.fit(train_x)
label = IntentLabel()
# label.init_from_config("intent.v0.2.cfg")
label.fit(train_y)
train_x = np.array(vocab.transform(train_x, max_length=10))
test_x = np.array(vocab.transform(test_x, max_length=10))
train_y = np.array(label.transform(train_y))
test_y = np.array(label.transform(test_y))
fasttext_param = {
    "vocab_size": len(vocab),
    # "embed_dim": w2v.shape[1],  # used when initializing from the word2vec matrix
    "embed_dim": 40,
    "class_num": len(label),
    "lr": 0.01,
}
if __name__ == "__main__":
    import numpy as np
    from chatbot.utils.path import ROOT_PATH, MODEL_PATH
    from chatbot.utils.data import read_fasttext_file
    from chatbot.cparse.vocabulary import Vocabulary
    from chatbot.cparse.label import IntentLabel

    p = ROOT_PATH.parent / "corpus" / "intent" / "fastText"
    x, y = read_fasttext_file(str(p / "corpus"))
    train_x, train_y = x[:7000], y[:7000]
    test_x, test_y = x[7000:], y[7000:]
    import copy
    x = copy.deepcopy(train_x)
    # NOTE: the held-out split above is immediately overwritten by demo.train.txt.
    test_x, test_y = read_fasttext_file(str(p / "demo.train.txt"))
    vocab = Vocabulary()
    vocab.fit(train_x)
    label = IntentLabel()
    label.fit(train_y)
    train_x = np.array(vocab.transform(train_x, max_length=10))
    test_x = np.array(vocab.transform(test_x, max_length=10))
    train_y = np.array(label.transform(train_y))
    test_y = np.array(label.transform(test_y))
    param = {
        "vocab_size": len(vocab),
        "embed_dim": 60,
        "class_num": len(label),
        "lr": 0.01,
        "hidden_dim": 10,
        # "dropout": 0.5,
    }
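# The extra "hidden_dim" key suggests a recurrent classifier. A minimal sketch of
# such a model under that assumption (a GRU encoder whose final hidden state feeds a
# linear classifier); the repo's actual model class is not shown in this snippet.
import torch
import torch.nn as nn
import torch.nn.functional as F

class RNNClassifierSketch(nn.Module):
    def __init__(self, vocab_size, embed_dim, class_num, hidden_dim, **kwargs):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = nn.GRU(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, class_num)

    def forward(self, x):
        # x: (batch, seq_len) token ids
        _, h_n = self.rnn(self.embed(x))   # h_n: (1, batch, hidden_dim)
        return F.log_softmax(self.fc(h_n.squeeze(0)), dim=1)

# model = RNNClassifierSketch(**param)  # "lr" is absorbed by **kwargs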
from chatbot.skills.botQA import BotQA
from chatbot.skills.data_query import TestDataQuery, DataInquiry
from chatbot.skills.help import Help
from chatbot.skills.safe import SafeResponse
from chatbot.skills.tuling import Tuling
from chatbot.skills.file_retrieval import FileRetrievalExt
from chatbot.bot import ChatBot
from chatbot.utils.path import MODEL_PATH
from chatbot.cparse.vocabulary import Vocabulary
from chatbot.cparse.label import IntentLabel
from wxpy import Bot
# NOTE: FastText, IntentRuleV1, NerRuleV1, and LeaveMessage are used below but were
# not imported in the original snippet; their module paths are not shown here.

intent_model = FastText.load(
    str(MODEL_PATH / "v0.21" / "intent_model.FastText"))
intent_rule = IntentRuleV1()
ner = NerRuleV1()
label = IntentLabel.load(str(MODEL_PATH / "v0.21" / "label"))
vocab = Vocabulary.load(str(MODEL_PATH / "v0.21" / "vocab"))
file_retrieval = FileRetrievalExt(
    str(MODEL_PATH / "v0.2" / "file_retrieval" / "test"),
    str(MODEL_PATH / "v0.2" / "file_retrieval" / "policy_filev3.utf8.csv"))
cbot = ChatBot(intent_model=intent_model,
               intent_rule=intent_rule,
               vocab=vocab,
               label=label,
               ner=ner,
               intent2skills={
                   "未知": SafeResponse(),      # "unknown"
                   "留言": LeaveMessage(),      # "leave a message"
                   "帮助": Help(),              # "help"
                   "闲聊": Tuling(),            # "chitchat"
                   "文件检索": file_retrieval,  # "file retrieval"
        logit = F.log_softmax(self.fc1(x), dim=1)
        # logit = F.softmax(self.fc1(x), dim=1)
        # logit = self.fc1(x)
        return logit


if __name__ == "__main__":
    from chatbot.utils.data import read_fasttext_file
    from chatbot.cparse.vocabulary import Vocabulary
    from chatbot.cparse.label import IntentLabel
    from chatbot.utils.path import ROOT_PATH
    import numpy as np

    p = ROOT_PATH.parent / "corpus" / "intent" / "fastText"
    train_x, train_y = read_fasttext_file(str(p / "demo.train.txt"))
    # NOTE: the "test" set is read from the same file as the training set.
    test_x, test_y = read_fasttext_file(str(p / "demo.train.txt"))
    vocab = Vocabulary()
    vocab.fit(train_x)
    label = IntentLabel()
    label.fit(train_y)
    train_x = np.array(vocab.transform(train_x, max_length=10))
    test_x = np.array(vocab.transform(test_x, max_length=10))
    train_y = np.array(label.transform(train_y))
    test_y = np.array(label.transform(test_y))
    textCNN_param = {
        "vocab_size": len(vocab),
        "embed_dim": 60,
        "class_num": len(label),
        "kernel_num": 16,
        "kernel_size": [3, 4, 5],
        "dropout": 0.5,
    }
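# A minimal TextCNN sketch consistent with textCNN_param and the forward tail shown
# above (parallel 1-D convolutions over the embedded sequence, max-pooled and
# concatenated before fc1, as in Kim 2014). The exact architecture is an assumption
# inferred from the parameter names, not the repo's actual class definition.
import torch
import torch.nn as nn
import torch.nn.functional as F

class TextCNNSketch(nn.Module):
    def __init__(self, vocab_size, embed_dim, class_num, kernel_num, kernel_size, dropout):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.convs = nn.ModuleList(
            [nn.Conv1d(embed_dim, kernel_num, k) for k in kernel_size])
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(kernel_num * len(kernel_size), class_num)

    def forward(self, x):
        x = self.embed(x).transpose(1, 2)             # (batch, embed_dim, seq_len)
        x = [F.relu(conv(x)) for conv in self.convs]  # each: (batch, kernel_num, L_out)
        x = [F.max_pool1d(c, c.size(2)).squeeze(2) for c in x]
        x = self.dropout(torch.cat(x, dim=1))
        return F.log_softmax(self.fc1(x), dim=1)

# model = TextCNNSketch(**textCNN_param)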