Beispiel #1
0
def getTrainSenteceVec(type):
    """Fetch one data split from the database and vectorise it with ``getVec``.

    Args:
        type: Split selector. ``TRAINTYPE`` loads
            ``dbConnect.getData(begin=0, end=9000)``; ``TESTTYPE`` loads
            ``dbConnect.getData(begin=9000, end=11000)``.

    Returns:
        Whatever ``getVec`` returns for the selected rows.

    Raises:
        ValueError: If ``type`` is neither ``TRAINTYPE`` nor ``TESTTYPE``.
    """
    # NOTE: the parameter name shadows the builtin ``type``; it is kept so
    # that callers passing it by keyword continue to work.
    if type == TRAINTYPE:
        data = dbConnect.getData(begin=0, end=9000)
    elif type == TESTTYPE:
        data = dbConnect.getData(begin=9000, end=11000)
    else:
        # Without this branch ``data`` would be unbound and the return below
        # would fail with an opaque UnboundLocalError.
        raise ValueError("unknown data type: %r" % (type,))

    return getVec(data)
Beispiel #2
0
def getLongRecord(type):
    """Load one data split and vectorise it with both ``getSplitVec`` and ``getVec``.

    Args:
        type: Split selector. ``TRAINTYPE`` loads
            ``dbConnect.getData(begin=0, end=9000)``; ``TESTTYPE`` loads
            ``dbConnect.getData(begin=9000, end=6000)``.

    Returns:
        A 6-tuple: the three values returned by ``getSplitVec(longData)``
        followed by the three values returned by ``getVec(longData)``.

    Raises:
        ValueError: If ``type`` is neither ``TRAINTYPE`` nor ``TESTTYPE``.
    """
    # NOTE: the parameter name shadows the builtin ``type``; it is kept so
    # that callers passing it by keyword continue to work.
    if type == TRAINTYPE:
        longData = dbConnect.getData(begin=0, end=9000)
    elif type == TESTTYPE:
        # NOTE(review): ``end`` is smaller than ``begin`` here; whether ``end``
        # is an absolute index or a row count cannot be told from this
        # function - confirm against dbConnect.getData.
        longData = dbConnect.getData(begin=9000, end=6000)
    else:
        # Without this branch ``longData`` would be unbound and the calls
        # below would fail with an opaque UnboundLocalError.
        raise ValueError("unknown data type: %r" % (type,))

    # The same rows are vectorised twice, once per helper.
    X_train, y_train, seq_len_train = getSplitVec(longData)
    X_train_noh, y_train_noh, seq_len_train_noh = getVec(longData)
    return X_train, y_train, seq_len_train, X_train_noh, y_train_noh, seq_len_train_noh
Beispiel #3
0
    def run(self, dispatcher: CollectingDispatcher,
            tracker: Tracker,
            domain: Dict[Text, Any]) -> List[Dict[Text, Any]]:
        """Query the customer table and send the result along with a greeting.

        Args:
            dispatcher: Object whose ``utter_message`` is called to reply.
            tracker: Not used by this method.
            domain: Not used by this method.

        Returns:
            Always an empty list.
        """
        # Hand the SQL statement to the getData helper and keep its result.
        rows = getData("select * from customer")

        # Echo the fetched result on stdout.
        print("data: ",rows)

        # Reply with the fixed greeting text and the query result as the
        # JSON payload.
        dispatcher.utter_message(json_message=rows, text="Hello World!")

        return []
Beispiel #4
0
def buildModel():
    """Train a Word2Vec model on space-tokenised rows and save it to disk.

    Rows come from ``dbConnect.getData(begin=0, end=50000, type=2)``. The
    element at index 1 of every row is split on single spaces to form one
    sentence (a list of tokens). The trained model is written to
    ``word2vecModel/imdb_word2vecModel``; nothing is returned.
    """
    rows = dbConnect.getData(begin=0, end=50000, type=2)

    # One token list per row, built from the text at index 1.
    tokenised = [row[1].split(' ') for row in rows]

    # min_count: minimum number of occurrences a word needs to be trained
    #            (library default is 5).
    # size:      dimensionality of the word vectors (library default is 100).
    # workers:   number of training threads (default 1, i.e. no
    #            multithreading); only meaningful when Cython is installed.
    w2v = Word2Vec(tokenised, min_count=2, size=EMBEDDING_DIM)

    # Persist the trained model.
    w2v.save("word2vecModel/imdb_word2vecModel")
Beispiel #5
0
def buildModel():
    """Train a Word2Vec model on jieba-segmented rows and save it to disk.

    Rows come from ``dbConnect.getData(begin=0, end=22000)``. For every row
    the element at index 1 is converted to ``str``, filtered with the
    module-level pattern ``r`` and segmented with ``jieba.cut`` in
    ``cut_all`` mode. The trained model is written to
    ``word2vecModel/word2vecModel``; nothing is returned.
    """
    rows = dbConnect.getData(begin=0, end=22000)

    # One token list per row: strip everything matched by ``r``, then segment.
    tokenised = [
        list(jieba.cut(re.sub(r, '', str(row[1])), cut_all=True))
        for row in rows
    ]

    # min_count: minimum number of occurrences a word needs to be trained
    #            (library default is 5).
    # size:      dimensionality of the word vectors (library default is 100).
    # workers:   number of training threads (default 1, i.e. no
    #            multithreading); only meaningful when Cython is installed.
    w2v = Word2Vec(tokenised, min_count=1, size=EMBEDDING_DIM)

    # Persist the trained model.
    w2v.save("word2vecModel/word2vecModel")
Beispiel #6
0
# Python 2 idiom: sys.setdefaultencoding is removed from the sys module at
# interpreter start-up, so sys is reloaded first to get it back; the default
# string encoding is then switched to UTF-8. The order of the two statements
# matters.
reload(sys)
sys.setdefaultencoding('utf-8')

# Filter pattern: punctuation marks and whitespace runs that are removed from
# the text via re.sub(r, '', ...).
# The parentheses and the question mark are escaped so that they match
# literally; unescaped, "(|)" is an empty group and "|?" makes the pattern
# fail to compile ("nothing to repeat"). A raw string keeps the backslashes
# intact without relying on invalid string escapes.
# NOTE(review): the full-width forms of these marks may have been intended
# for Chinese text - confirm against the data.
r = r"\(|\)|;|、|!|,|。|\*|\?|~|\<|\>|\s+"
maxSeqLength = 250

EMBEDDING_DIM = 10
TRAINTYPE = 1
TESTTYPE = 0

MAXREVLEN = 6
SENTLENGTH = 20

# Rows are fetched once when the module is executed; both module-level names
# refer to the same result object.
longData = dbConnect.getData(begin=28000, end=400)
testdata = longData


#词向量模型训练函数
def buildModel():
    # 所有词集合,包括重复词
    # allSentences = []

    data = dbConnect.getData(begin=0, end=22000)

    sentences = []
    for x in data:
        line = re.sub(r, '', str(x[1]))
        se_list = jieba.cut(line, cut_all=True)
        result = list(se_list)
        # allSentences.extend(result)