def get_features(self, data, method='word2vec'):
    '''
    @description: generate a feature matrix for the given dataset
    @param {type}
    data, input dataset with a 'queryCutRMStopWord' column of token lists
    method: one of 'tfidf', 'word2vec', 'fasttext'
    @return: corresponding feature matrix
    @raise: NotImplementedError for an unknown method
    '''
    if method == 'tfidf':
        # the tfidf vectorizer expects whitespace-joined token strings
        data = [' '.join(query) for query in data['queryCutRMStopWord']]
        return self.em.tfidf.transform(data)
    elif method == 'word2vec':
        # wam(...)[0] is the aggregated sentence embedding
        return np.vstack(data['queryCutRMStopWord'].apply(
            lambda x: wam(x, self.em.w2v)[0]))
    elif method == 'fasttext':
        return np.vstack(data['queryCutRMStopWord'].apply(
            lambda x: wam(x, self.em.fast)[0]))
    else:
        # BUG FIX: original evaluated `NotImplementedError` without raising,
        # silently returning None for unknown methods.
        raise NotImplementedError('unknown feature method: {}'.format(method))
def get_embedding_feature(mldata):
    '''
    @description: get_embedding_feature, tfidf, word2vec -> max/mean,
                  word2vec n-gram(2, 3, 4) -> max/mean, label embedding -> max/mean
    @param {type} mldata, input data set, mldata class instance
    @return:
    train_tfidf, tfidf of train data set
    test_tfidf, tfidf of test data set
    train, train data set with generated embedding features
    test, test data set with generated embedding features
    '''
    # join tokens back into whitespace-separated strings for the tfidf vectorizer
    mldata.train["queryCutRMStopWords"] = mldata.train[
        "queryCutRMStopWord"].apply(lambda x: " ".join(x))
    mldata.dev["queryCutRMStopWords"] = mldata.dev["queryCutRMStopWord"].apply(
        lambda x: " ".join(x))
    train_tfidf = pd.DataFrame(
        mldata.em.tfidf.transform(
            mldata.train["queryCutRMStopWords"].tolist()).toarray())
    train_tfidf.columns = [
        'tfidf' + str(i) for i in range(train_tfidf.shape[1])
    ]
    test_tfidf = pd.DataFrame(
        mldata.em.tfidf.transform(
            mldata.dev["queryCutRMStopWords"].tolist()).toarray())
    # BUG FIX: originally sized from train_tfidf.shape[1]; use the test
    # frame's own width so the code stays correct if the shapes ever diverge.
    test_tfidf.columns = [
        'tfidf' + str(i) for i in range(test_tfidf.shape[1])
    ]
    print("transform w2v")
    # per-token embeddings (aggregate=False keeps the [seq_len, dim] matrix)
    mldata.train['w2v'] = mldata.train["queryCutRMStopWord"].apply(
        lambda x: wam(x, mldata.em.w2v, aggregate=False))
    mldata.dev['w2v'] = mldata.dev["queryCutRMStopWord"].apply(
        lambda x: wam(x, mldata.em.w2v, aggregate=False))
    # deep copies so generate_feature does not mutate mldata's frames
    train = copy.deepcopy(mldata.train)
    test = copy.deepcopy(mldata.dev)
    # BUG FIX: open via a context manager with explicit encoding (consistent
    # with the other loaders in this file) so the file handle is not leaked.
    with open(config.root_path + '/data/label2id.json',
              encoding='utf-8') as f:
        labelNameToIndex = json.load(f)
    labelIndexToName = {v: k for k, v in labelNameToIndex.items()}
    # embedding vector for every label name present in the w2v vocabulary
    w2v_label_embedding = np.array([
        mldata.em.w2v.wv.get_vector(labelIndexToName[key])
        for key in labelIndexToName
        if labelIndexToName[key] in mldata.em.w2v.wv.vocab.keys()
    ])
    joblib.dump(w2v_label_embedding,
                config.root_path + '/data/w2v_label_embedding.pkl')
    train = generate_feature(train, w2v_label_embedding, model_name='w2v')
    test = generate_feature(test, w2v_label_embedding, model_name='w2v')
    return train_tfidf, test_tfidf, train, test
def get_feature(self, data, method='word2vec'):
    '''
    @description: generate feature
    @param {type}
    data, input dataset
    method: three options, word2vec, fasttext, tfidf
    @return: corresponding feature
    @raise: NotImplementedError for an unknown method
    '''
    if method == 'tfidf':
        # the tfidf vectorizer expects whitespace-joined token strings
        data = [' '.join(query) for query in data["queryCutRMStopWord"]]
        return self.em.tfidf.transform(data)
    elif method == 'word2vec':
        # wam(...)[0] is the aggregated sentence embedding
        return np.vstack(data['queryCutRMStopWord'].apply(
            lambda x: wam(x, self.em.w2v)[0]))
    elif method == 'fasttext':
        return np.vstack(data['queryCutRMStopWord'].apply(
            lambda x: wam(x, self.em.fast)[0]))
    else:
        # BUG FIX: original evaluated `NotImplementedError` without raising,
        # silently returning None for unknown methods.
        raise NotImplementedError('unknown feature method: {}'.format(method))
def get_embedding_features(data, tfidf, embedding_model):
    '''
    @description: build tfidf features and embedding-based features
    @param {type}
    data, input dataset with a 'queryCutRMStopWord' column of token lists
    tfidf, fitted tfidf vectorizer
    embedding_model, trained word2vec model
    @return:
    tfidf_data, tfidf feature frame
    train, deep copy of data augmented with embedding features
    '''
    # join tokens back into whitespace-separated strings for the tfidf vectorizer
    data['queryCutRMStopWords'] = data['queryCutRMStopWord'].apply(
        lambda x: ' '.join(x))
    tfidf_data = pd.DataFrame(
        tfidf.transform(data['queryCutRMStopWords'].tolist()).toarray())
    tfidf_data.columns = ['tfidf' + str(i) for i in range(tfidf_data.shape[1])]
    # per-token embeddings (aggregate=False keeps the [seq_len, dim] matrix)
    data['w2v'] = data['queryCutRMStopWord'].apply(
        lambda x: wam(x, embedding_model, aggregate=False))
    train = copy.deepcopy(data)
    # BUG FIX: open via a context manager so the json file handle is closed
    with open(config.root_path + '/data/label2id.json',
              encoding='utf-8') as f:
        labelNameToIndex = json.load(f)
    labelIndexToName = {v: k for k, v in labelNameToIndex.items()}
    # BUG FIX: original passed a bare generator expression to np.array, which
    # yields a 0-d object array; a list comprehension (as in the sibling
    # get_embedding_feature) produces the intended 2-D label-embedding matrix.
    w2v_label_embedding = np.array([
        embedding_model.wv.get_vector(labelIndexToName[key])
        for key in labelIndexToName
        if labelIndexToName[key] in embedding_model.wv.vocab.keys()
    ])
    joblib.dump(w2v_label_embedding,
                config.root_path + '/data/w2v_label_embedding.pkl')
    train = generate_feature(train, w2v_label_embedding, model_name='w2v')
    return tfidf_data, train
def get_embedding_feature(data, tfidf, embedding_model):
    '''
    @description: get_embedding_feature, tfidf, word2vec -> max/mean,
                  word2vec n-gram(2, 3, 4) -> max/mean, label embedding -> max/mean
    @param {type}
    data, input data set (DataFrame with a 'queryCutRMStopWord' column)
    tfidf, fitted tfidf vectorizer
    embedding_model, trained word2vec model
    @return:
    tfidf_data, tfidf features of the data set
    train, data set augmented with embedding features
    '''
    # build tfidf features from the stop-word-filtered tokens
    data["queryCutRMStopWords"] = data["queryCutRMStopWord"].apply(
        lambda x: " ".join(x))
    tfidf_data = pd.DataFrame(
        tfidf.transform(data["queryCutRMStopWords"].tolist()).toarray())
    tfidf_data.columns = ['tfidf' + str(i) for i in range(tfidf_data.shape[1])]
    print("transform w2v")
    # likewise, get per-token embedding features without aggregation
    data['w2v'] = data["queryCutRMStopWord"].apply(
        lambda x: wam(x, embedding_model, aggregate=False))  # [seq_len * 300]
    # deep copy the data
    train = copy.deepcopy(data)
    # load all label names, fetch each label's embedding, and persist it
    # BUG FIX: open via a context manager so the json file handle is closed
    with open(config.root_path + '/data/label2id.json',
              encoding='utf-8') as f:
        labelNameToIndex = json.load(f)
    labelIndexToName = {v: k for k, v in labelNameToIndex.items()}
    w2v_label_embedding = np.array([
        embedding_model.wv.get_vector(labelIndexToName[key])
        for key in labelIndexToName
        if labelIndexToName[key] in embedding_model.wv.vocab.keys()
    ])
    joblib.dump(w2v_label_embedding,
                config.root_path + '/data/w2v_label_embedding.pkl')
    # derive the aggregated embedding features from the unaggregated vectors
    train = generate_feature(train, w2v_label_embedding, model_name='w2v')
    return tfidf_data, train