Example #1
0
def get_word2vec_matrix(path):
    """Load a trained word2vec model and derive a vocabulary plus an
    aligned embedding matrix.

    Two all-zero rows are prepended to the vector table; presumably they
    correspond to reserved ids (padding/unknown) at the front of
    ``Vocabulary`` — TODO confirm against the Vocabulary class.

    :param path: filesystem path of the saved ``Word2vecExt`` model.
    :return: ``(vocab, matrix)`` where ``matrix`` has shape
        ``(2 + n_words, embedding_dim)``.
    """
    w2v = Word2vecExt()
    w2v.load(path)
    keyed_vectors = w2v.model.wv
    vocabulary = Vocabulary()
    vocabulary.fit(keyed_vectors.index2word)
    embedding_dim = keyed_vectors.vectors.shape[1]
    reserved_rows = np.zeros((2, embedding_dim))
    embedding = np.concatenate((reserved_rows, keyed_vectors.vectors), axis=0)
    return vocabulary, embedding
Example #2
0
def load(path, MAX_LEN):
    """Read a parallel corpus stored as 3-line records and build the
    source/target sequences together with their vocabularies.

    Record layout (assumed from the indexing): of each triple of lines,
    line 0 is ignored, line 1 is the source and line 2 the target; the
    first two characters of each are dropped as metadata — TODO confirm
    against the corpus format.

    :param path: path of the corpus file.
    :param MAX_LEN: pairs whose source or target exceeds this many
        characters (measured before tokenisation) are skipped.
    :return: ``(source_seq, s_vocab, target_seq, t_vocab)``; every
        target sequence ends with ``"<eos>"``.
    """
    s_vocab = Vocabulary(init_vocablary=["<sos>", "<eos>"])
    t_vocab = Vocabulary(init_vocablary=["<sos>", "<eos>"])
    with open(path, 'r') as f:
        lines = f.readlines()

    def _clean(raw):
        # Drop trailing newline, lowercase, strip the 2-char prefix and
        # remove all spaces.
        return re.sub(" ", "", raw.strip("\n").lower()[2:])

    source_seq = []
    target_seq = []
    record_count = len(lines) // 3
    for start in range(0, record_count * 3, 3):
        src = _clean(lines[start + 1])
        tgt = _clean(lines[start + 2])
        if not src or not tgt:
            continue
        if len(src) > MAX_LEN or len(tgt) > MAX_LEN:
            continue
        source_seq.append(src)
        target_seq.append(tgt)
    # `cut` presumably tokenises the raw strings — TODO confirm its contract.
    source_seq = cut(source_seq, 4)
    target_seq = cut(target_seq, 4)
    for tokens in target_seq:
        tokens.append("<eos>")
    s_vocab.fit(source_seq)
    t_vocab.fit(target_seq)
    return source_seq, s_vocab, target_seq, t_vocab
Example #3
0
    # NOTE(review): fragment of a script body — the enclosing block
    # (presumably `if __name__ == "__main__":`) is outside this view.
    # p = ROOT_PATH.parent / "corpus" / "intent" / "fastText"
    # x, y = read_fasttext_file(str(p / "amazon.txt"))
    # train_x, train_y = x[:7000], y[:7000]
    # test_x, test_y = x[7000:], y[7000:]
    import pandas as pd
    # Training corpus: concatenation of two labelled Excel sheets; a third
    # sheet serves as the held-out test set.
    p = ROOT_PATH.parent / "corpus" / "intent"
    c1 = pd.read_excel(str(p / "intent_1.xlsx"))[["text", "intent"]]
    c2 = pd.read_excel(str(p / "intent_2.xlsx"))
    corpus_train = pd.concat([c1, c2]).reset_index(drop=True)
    corpus_test = pd.read_excel(str(p / "intent_valid.xlsx"))
    # `cut` presumably tokenises the raw text column — TODO confirm.
    train_x = cut(corpus_train.text.tolist())
    train_y = corpus_train.intent.tolist()
    test_x = cut(corpus_test.text.tolist())
    test_y = corpus_test.intent.tolist()

    # Fit vocabulary/label encoders on the training split only, then
    # encode both splits to fixed-length (10-token) integer arrays.
    vocab = Vocabulary()
    vocab.fit(train_x)
    label = IntentLabel()
    label.fit(train_y)
    train_x = np.array(vocab.transform(train_x, max_length=10))
    test_x = np.array(vocab.transform(test_x, max_length=10))
    train_y = np.array(label.transform(train_y))
    test_y = np.array(label.transform(test_y))

    # Model hyper-parameters. NOTE(review): `p` is reused here, shadowing
    # the Path bound above.
    p = {
        "vocab_size": len(vocab),
        "embed_dim": 40,
        "class_num": len(label),
        "lr": 0.01,
        # "dropout": 0.5,
    }
Example #4
0
if __name__ == "__main__":
    # Script entry: train/evaluate an intent classifier on the Amazon
    # fastText-format corpus. NOTE(review): the visible span ends at the
    # hyper-parameter dict; the rest of the script is outside this view.
    import numpy as np
    from chatbot.utils.path import ROOT_PATH, MODEL_PATH
    from chatbot.utils.data import read_fasttext_file
    from chatbot.cparse.vocabulary import Vocabulary
    from chatbot.cparse.label import IntentLabel

    # Simple positional split: first 7000 samples train, remainder test.
    p = ROOT_PATH.parent / "corpus" / "intent" / "fastText"
    x, y = read_fasttext_file(str(p / "amazon.txt"))
    train_x, train_y = x[:7000], y[:7000]
    test_x, test_y = x[7000:], y[7000:]

    # train_x, train_y = read_fasttext_file(str(p/"demo.train.txt"))
    # test_x, test_y = read_fasttext_file(str(p / "demo.train.txt"))

    # Fit encoders on the training split, then encode both splits to
    # fixed-length (50-token) integer arrays.
    vocab = Vocabulary()
    vocab.fit(train_x)
    label = IntentLabel()
    label.fit(train_y)
    train_x = np.array(vocab.transform(train_x, max_length=50))
    test_x = np.array(vocab.transform(test_x, max_length=50))
    train_y = np.array(label.transform(train_y))
    test_y = np.array(label.transform(test_y))

    # Model hyper-parameters. NOTE(review): `p` is reused, shadowing the
    # Path bound above.
    p = {
        "vocab_size": len(vocab),
        "embed_dim": 60,
        "class_num": len(label),
        "lr": 0.01,
        # "dropout": 0.5,
    }
Example #5
0
    # NOTE(review): fragment of a script body — the enclosing block and
    # the imports (pd, np, cut, Vocabulary, IntentLabel, ROOT_PATH) are
    # outside this view, and the `fasttext_param` dict is cut off below.
    # Training corpus: concatenation of two labelled Excel sheets; a
    # third sheet serves as the held-out test set.
    p = ROOT_PATH.parent / "corpus" / "intent"
    c1 = pd.read_excel(str(p / "intent_1.xlsx"))[["text", "intent"]]
    c2 = pd.read_excel(str(p / "intent_2.xlsx"))
    corpus_train = pd.concat([c1, c2]).reset_index(drop=True)
    corpus_test = pd.read_excel(str(p / "intent_valid.xlsx"))
    # `cut` presumably tokenises the raw text column — TODO confirm.
    train_x = cut(corpus_train.text.tolist())
    train_y = corpus_train.intent.tolist()
    test_x = cut(corpus_test.text.tolist())
    test_y = corpus_test.intent.tolist()

    from chatbot.preprocessing.word2vec import get_word2vec_matrix
    # vocab, w2v = get_word2vec_matrix(
    #     str(ROOT_PATH.parent/"corpus"/"word2vec"/"wiki_default")
    # )

    # Fit encoders on the training split, then encode both splits to
    # fixed-length (10-token) integer arrays.
    vocab = Vocabulary()
    vocab.fit(train_x)
    label = IntentLabel()
    # label.init_from_config("intent.v0.2.cfg")
    label.fit(train_y)
    train_x = np.array(vocab.transform(train_x, max_length=10))
    test_x = np.array(vocab.transform(test_x, max_length=10))
    train_y = np.array(label.transform(train_y))
    test_y = np.array(label.transform(test_y))

    # Model hyper-parameters (dict continues past this fragment).
    fasttext_param = {
        "vocab_size": len(vocab),
        # "embed_dim": w2v.shape[1],
        "embed_dim": 40,
        "class_num": len(label),
        "lr": 0.01,
Example #6
0
if __name__ == "__main__":
    # Script entry: train/evaluate an intent classifier. NOTE(review):
    # the `param` dict is cut off below; the rest of the script is
    # outside this view.
    import numpy as np
    from chatbot.utils.path import ROOT_PATH, MODEL_PATH
    from chatbot.utils.data import read_fasttext_file
    from chatbot.cparse.vocabulary import Vocabulary
    from chatbot.cparse.label import IntentLabel

    # Positional split: first 7000 samples train, remainder test...
    p = ROOT_PATH.parent / "corpus" / "intent" / "fastText"
    x, y = read_fasttext_file(str(p / "corpus"))
    train_x, train_y = x[:7000], y[:7000]
    test_x, test_y = x[7000:], y[7000:]
    import copy
    # NOTE(review): `x` is rebound to a deep copy of train_x and the test
    # split is then overwritten from the demo file — the copy appears
    # unused in this fragment; verify downstream usage.
    x = copy.deepcopy(train_x)
    test_x, test_y = read_fasttext_file(str(p / "demo.train.txt"))
    # Fit encoders on the training split, then encode both splits to
    # fixed-length (10-token) integer arrays.
    vocab = Vocabulary()
    vocab.fit(train_x)
    label = IntentLabel()
    label.fit(train_y)
    train_x = np.array(vocab.transform(train_x, max_length=10))
    test_x = np.array(vocab.transform(test_x, max_length=10))
    train_y = np.array(label.transform(train_y))
    test_y = np.array(label.transform(test_y))

    # Model hyper-parameters (dict continues past this fragment).
    param = {
        "vocab_size": len(vocab),
        "embed_dim": 60,
        "class_num": len(label),
        "lr": 0.01,
        "hidden_dim": 10,
        # "dropout": 0.5,
Example #7
0
# Module-level wiring of a ChatBot instance from saved model artifacts.
# NOTE(review): FastText, IntentRuleV1, NerRuleV1, IntentLabel, Vocabulary
# and LeaveMessage are referenced below but imported outside this view;
# the ChatBot(...) call is also cut off at the end of the fragment.
from chatbot.skills.botQA import BotQA
from chatbot.skills.data_query import TestDataQuery, DataInquiry
from chatbot.skills.help import Help
from chatbot.skills.safe import SafeResponse
from chatbot.skills.tuling import Tuling
from chatbot.skills.file_retrieval import FileRetrievalExt
from chatbot.bot import ChatBot
from chatbot.utils.path import MODEL_PATH
from wxpy import Bot

# Load the v0.21 intent model and its matching label/vocab encoders.
intent_model = FastText.load(
    str(MODEL_PATH / "v0.21" / "intent_model.FastText"))
intent_rule = IntentRuleV1()
ner = NerRuleV1()
label = IntentLabel.load(str(MODEL_PATH / "v0.21" / "label"))
vocab = Vocabulary.load(str(MODEL_PATH / "v0.21" / "vocab"))
# File-retrieval skill uses the older v0.2 artifacts.
file_retrieval = FileRetrievalExt(
    str(MODEL_PATH / "v0.2" / "file_retrieval" / "test"),
    str(MODEL_PATH / "v0.2" / "file_retrieval" / "policy_filev3.utf8.csv"))

# Map each recognised intent (keys are Chinese intent names: unknown,
# leave-message, help, chit-chat, file retrieval) to a skill handler.
cbot = ChatBot(intent_model=intent_model,
               intent_rule=intent_rule,
               vocab=vocab,
               label=label,
               ner=ner,
               intent2skills={
                   "未知": SafeResponse(),
                   "留言": LeaveMessage(),
                   "帮助": Help(),
                   "闲聊": Tuling(),
                   "文件检索": file_retrieval,
Example #8
0
        # NOTE(review): tail of a method whose `def` is outside this view.
        # Project features through fc1 and return log-probabilities over
        # classes (alternatives left commented out by the author).
        logit = F.log_softmax(self.fc1(x), dim=1)
        # logit = F.softmax(self.fc1(x), dim=1)
        # logit = self.fc1(x)
        return logit


if __name__ == "__main__":
    # Script entry: smoke-train a TextCNN intent classifier on the demo
    # corpus. NOTE(review): train and test deliberately read the same
    # file here, and the `textCNN_param` dict is cut off below.
    from chatbot.utils.data import read_fasttext_file
    from chatbot.cparse.vocabulary import Vocabulary
    from chatbot.cparse.label import IntentLabel
    from chatbot.utils.path import ROOT_PATH
    import numpy as np
    p = ROOT_PATH.parent / "corpus" / "intent" / "fastText"
    train_x, train_y = read_fasttext_file(str(p / "demo.train.txt"))
    test_x, test_y = read_fasttext_file(str(p / "demo.train.txt"))
    # Fit encoders on the training split, then encode both splits to
    # fixed-length (10-token) integer arrays.
    vocab = Vocabulary()
    vocab.fit(train_x)
    label = IntentLabel()
    label.fit(train_y)
    train_x = np.array(vocab.transform(train_x, max_length=10))
    test_x = np.array(vocab.transform(test_x, max_length=10))
    train_y = np.array(label.transform(train_y))
    test_y = np.array(label.transform(test_y))

    # TextCNN hyper-parameters (dict continues past this fragment).
    textCNN_param = {
        "vocab_size": len(vocab),
        "embed_dim": 60,
        "class_num": len(label),
        "kernel_num": 16,
        "kernel_size": [3, 4, 5],
        "dropout": 0.5,