Ejemplo n.º 1
0
Archivo: mlData.py Proyecto: MiniBee/dl
 def get_features(self, data, method='word2vec'):
     """Build a feature matrix for ``data`` using the chosen representation.

     Parameters
     ----------
     data : mapping/DataFrame with a 'queryCutRMStopWord' column of token lists
     method : str
         One of 'tfidf', 'word2vec' or 'fasttext'.

     Returns
     -------
     Sparse tf-idf matrix, or a numpy array of stacked sentence vectors.

     Raises
     ------
     NotImplementedError
         If ``method`` is not one of the supported options.
     """
     if method == 'tfidf':
         # The fitted vectorizer expects whitespace-joined documents,
         # not lists of tokens.
         data = [' '.join(query) for query in data['queryCutRMStopWord']]
         return self.em.tfidf.transform(data)
     elif method == 'word2vec':
         # wam(...)[0] is the aggregated (pooled) sentence vector.
         return np.vstack(data['queryCutRMStopWord'].apply(
             lambda x: wam(x, self.em.w2v)[0]))
     elif method == 'fasttext':
         return np.vstack(data['queryCutRMStopWord'].apply(
             lambda x: wam(x, self.em.fast)[0]))
     else:
         # BUG FIX: the original evaluated the exception class as a bare
         # expression and fell through, silently returning None.
         raise NotImplementedError(f"unknown feature method: {method}")
Ejemplo n.º 2
0
def get_embedding_feature(mldata):
    '''
    @description: get_embedding_feature, tfidf, word2vec -> max/mean, word2vec n-gram(2, 3, 4) -> max/mean, label embedding->max/mean
    @param {type}
    mldata, input data set, mldata class instance
    @return:
    train_tfidf, tfidf of train data set
    test_tfidf, tfidf of test data set
    train, train data set
    test, test data set
    '''
    # Re-join the stop-word-filtered tokens into whitespace-separated
    # strings, which is what the fitted tf-idf vectorizer consumes.
    mldata.train["queryCutRMStopWords"] = mldata.train[
        "queryCutRMStopWord"].apply(lambda x: " ".join(x))
    mldata.dev["queryCutRMStopWords"] = mldata.dev["queryCutRMStopWord"].apply(
        lambda x: " ".join(x))
    train_tfidf = pd.DataFrame(
        mldata.em.tfidf.transform(
            mldata.train["queryCutRMStopWords"].tolist()).toarray())
    train_tfidf.columns = [
        'tfidf' + str(i) for i in range(train_tfidf.shape[1])
    ]
    test_tfidf = pd.DataFrame(
        mldata.em.tfidf.transform(
            mldata.dev["queryCutRMStopWords"].tolist()).toarray())
    # FIX: derive the column names from test_tfidf's own width rather than
    # train_tfidf's. The shared vectorizer makes them equal today, but the
    # original cross-coupling would break silently if that ever changed.
    test_tfidf.columns = [
        'tfidf' + str(i) for i in range(test_tfidf.shape[1])
    ]

    print("transform w2v")
    # Per-token embeddings, left unaggregated: one [seq_len, dim] array per row.
    mldata.train['w2v'] = mldata.train["queryCutRMStopWord"].apply(
        lambda x: wam(x, mldata.em.w2v, aggregate=False))
    mldata.dev['w2v'] = mldata.dev["queryCutRMStopWord"].apply(
        lambda x: wam(x, mldata.em.w2v, aggregate=False))

    # Deep-copy so generate_feature can add columns without mutating mldata.
    train = copy.deepcopy(mldata.train)
    test = copy.deepcopy(mldata.dev)
    # FIX: pass an explicit encoding (consistent with the sibling loaders)
    # so the label file parses identically on every platform.
    labelNameToIndex = json.load(
        open(config.root_path + '/data/label2id.json', encoding='utf-8'))
    labelIndexToName = {v: k for k, v in labelNameToIndex.items()}
    # Embed every label name that exists in the word2vec vocabulary.
    w2v_label_embedding = np.array([
        mldata.em.w2v.wv.get_vector(labelIndexToName[key])
        for key in labelIndexToName
        if labelIndexToName[key] in mldata.em.w2v.wv.vocab.keys()
    ])

    joblib.dump(w2v_label_embedding,
                config.root_path + '/data/w2v_label_embedding.pkl')
    # Derive pooled / label-similarity features from the raw embeddings.
    train = generate_feature(train, w2v_label_embedding, model_name='w2v')
    test = generate_feature(test, w2v_label_embedding, model_name='w2v')
    return train_tfidf, test_tfidf, train, test
Ejemplo n.º 3
0
 def get_feature(self, data, method='word2vec'):
     '''
     @description: generate feature
     @param {type}
     data, input dataset
     method: three options, word2vec, fasttext, tfidf
     @return: corresponding feature
     @raises: NotImplementedError if method is not a supported option
     '''
     if method == 'tfidf':
         # The fitted vectorizer expects whitespace-joined documents.
         data = [' '.join(query) for query in data["queryCutRMStopWord"]]
         return self.em.tfidf.transform(data)
     elif method == 'word2vec':
         # wam(...)[0] is the aggregated (pooled) sentence vector.
         return np.vstack(data['queryCutRMStopWord'].apply(
             lambda x: wam(x, self.em.w2v)[0]))
     elif method == 'fasttext':
         return np.vstack(data['queryCutRMStopWord'].apply(
             lambda x: wam(x, self.em.fast)[0]))
     else:
         # BUG FIX: the original evaluated the exception class as a bare
         # expression and fell through, silently returning None.
         raise NotImplementedError(f"unknown feature method: {method}")
Ejemplo n.º 4
0
def get_embedding_features(data, tfidf, embedding_model):
    '''
    @description: build tf-idf features and word2vec-derived features for data
    @param {type}
    data, input data set (DataFrame with a queryCutRMStopWord token column)
    tfidf, fitted tf-idf vectorizer
    embedding_model, trained word-embedding model (gensim-style, with .wv)
    @return:
    tfidf_data, tf-idf feature DataFrame
    train, deep copy of data enriched with embedding features
    '''
    # Re-join filtered tokens so the fitted tf-idf vectorizer can consume them.
    data['queryCutRMStopWords'] = data['queryCutRMStopWord'].apply(lambda x: ' '.join(x))
    tfidf_data = pd.DataFrame(tfidf.transform(data['queryCutRMStopWords'].tolist()).toarray())
    tfidf_data.columns = ['tfidf' + str(i) for i in range(tfidf_data.shape[1])]
    # Per-token embeddings, left unaggregated: one [seq_len, dim] array per row.
    data['w2v'] = data['queryCutRMStopWord'].apply(lambda x: wam(x, embedding_model, aggregate=False))

    # Deep-copy so generate_feature can add columns without mutating the input.
    train = copy.deepcopy(data)
    labelNameToIndex = json.load(open(config.root_path + '/data/label2id.json', encoding='utf-8'))
    labelIndexToName = {v: k for k, v in labelNameToIndex.items()}
    # BUG FIX: the original passed a *generator* to np.array, which yields a
    # useless 0-d object array instead of a [num_labels, dim] matrix.
    # A list comprehension (as in the sibling implementations) stacks properly.
    w2v_label_embedding = np.array([
        embedding_model.wv.get_vector(labelIndexToName[key])
        for key in labelIndexToName
        if labelIndexToName[key] in embedding_model.wv.vocab.keys()
    ])

    joblib.dump(w2v_label_embedding, config.root_path + '/data/w2v_label_embedding.pkl')
    # Derive pooled / label-similarity features from the raw embeddings.
    train = generate_feature(train, w2v_label_embedding, model_name='w2v')
    return tfidf_data, train
Ejemplo n.º 5
0
def get_embedding_feature(data, tfidf, embedding_model):
    '''
    @description: get_embedding_feature, tfidf, word2vec -> max/mean, word2vec n-gram(2, 3, 4) -> max/mean, label embedding->max/mean
    @param {type}
    data, input data set (DataFrame with a queryCutRMStopWord token column)
    tfidf, fitted tf-idf vectorizer
    embedding_model, trained word-embedding model (gensim-style, with .wv)
    @return:
    tfidf_data, tf-idf feature DataFrame of the data set
    train, deep copy of data enriched with embedding features
    '''
    # Re-join the stop-word-filtered tokens so the vectorizer can consume them.
    joined = data["queryCutRMStopWord"].apply(" ".join)
    data["queryCutRMStopWords"] = joined

    dense = tfidf.transform(joined.tolist()).toarray()
    tfidf_data = pd.DataFrame(dense)
    tfidf_data.columns = ['tfidf' + str(col) for col in range(tfidf_data.shape[1])]

    print("transform w2v")
    # Per-token embeddings, left unaggregated: each row holds [seq_len * 300].
    data['w2v'] = data["queryCutRMStopWord"].apply(
        lambda tokens: wam(tokens, embedding_model, aggregate=False))

    # Work on a deep copy so generate_feature does not mutate the caller's frame.
    train = copy.deepcopy(data)

    # Load the label vocabulary, then embed every label name the model knows.
    with open(config.root_path + '/data/label2id.json', encoding='utf-8') as fh:
        labelNameToIndex = json.load(fh)
    labelIndexToName = {idx: name for name, idx in labelNameToIndex.items()}
    known_labels = [name for name in labelIndexToName.values()
                    if name in embedding_model.wv.vocab.keys()]
    w2v_label_embedding = np.array(
        [embedding_model.wv.get_vector(name) for name in known_labels])

    joblib.dump(w2v_label_embedding,
                config.root_path + '/data/w2v_label_embedding.pkl')
    # Derive the pooled / label-similarity features from the raw embeddings.
    train = generate_feature(train, w2v_label_embedding, model_name='w2v')
    return tfidf_data, train