def feature_engineer(self):
    '''
    @description: Build the feature tables and label vectors for the
                  train and test splits held by self.ml_data.
    @param {type} None
    @return: X_train, feature of train set
             X_test, feature of test set
             y_train, label of train set (cast to int)
             y_test, label of test set (cast to int)
    '''
    logger.info('generate embedding feature .. ')
    train_tfidf, train = get_embedding_feature(self.ml_data.train,
                                               self.ml_data.em.tfidf,
                                               self.ml_data.em.w2v)
    test_tfidf, test = get_embedding_feature(self.ml_data.test,
                                             self.ml_data.em.tfidf,
                                             self.ml_data.em.w2v)

    # formate_data() takes the autoencoder features as its third argument,
    # so they have to be computed before it is called.  The call mirrors
    # the one used by the other feature builders in this file.
    train_ae = get_autoencoder_feature(
        train,
        self.ml_data.em.ae.max_features,
        self.ml_data.em.ae.max_len,
        self.ml_data.em.ae.encoder,
        tokenizer=self.ml_data.em.ae.tokenizer)
    test_ae = get_autoencoder_feature(
        test,
        self.ml_data.em.ae.max_features,
        self.ml_data.em.ae.max_len,
        self.ml_data.em.ae.encoder,
        tokenizer=self.ml_data.em.ae.tokenizer)

    train = formate_data(train, train_tfidf, train_ae)
    test = formate_data(test, test_tfidf, test_ae)

    # Build the train / test data: every column except the label is a
    # feature.  The column list is taken from train and applied to both
    # frames.
    cols = [x for x in train.columns if str(x) not in ['labelIndex']]
    X_train = train[cols]
    X_test = test[cols]
    train["labelIndex"] = train["labelIndex"].astype(int)
    test["labelIndex"] = test["labelIndex"].astype(int)
    y_train = train["labelIndex"]
    y_test = test["labelIndex"]
    return X_train, X_test, y_train, y_test
def process(self, title, desc):
    '''
    @description: Turn a single (title, desc) pair into the feature row
                  that the model consumes at prediction time.
    @param {type} title, value stored in the 'title' column
                  desc, value stored in the 'desc' column
    @return: the columns returned by formate_data, minus 'labelIndex'
    '''
    ###########################################
    #          TODO: module 5 task 1.1        #
    ###########################################
    # Process the input and generate the features needed for prediction.

    def drop_stop_words(tokens):
        return [
            token for token in tokens
            if token not in self.ml_data.em.stopWords
        ]

    df = pd.DataFrame([[title, desc]], columns=['title', 'desc'])
    df['text'] = df['title'] + df['desc']
    df["queryCut"] = df["text"].apply(query_cut)
    df["queryCutRMStopWord"] = df["queryCut"].apply(drop_stop_words)
    df_tfidf, df = get_embedding_feature(df, self.ml_data.em.tfidf,
                                         self.ml_data.em.w2v)

    print("generate basic feature ")
    df = get_basic_feature(df)

    print("generate modal feature ")
    # No cover image is supplied here, so the image models receive an
    # empty path for the single row.
    df['cover'] = ''
    image_models = (
        ('res_embedding', 'res_model'),
        ('resnext_embedding', 'resnext_model'),
        ('wide_embedding', 'wide_model'),
    )
    for column, model_attr in image_models:
        model = getattr(self, model_attr)
        df[column] = df['cover'].progress_apply(
            lambda path: get_img_embedding(path, model))

    print("generate bert feature ")
    df['bert_embedding'] = df['text'].progress_apply(
        lambda text: get_pretrain_embedding(text, self.bert_tonkenizer,
                                            self.bert))

    print("generate lda feature ")
    df['bow'] = df['queryCutRMStopWord'].apply(
        lambda tokens: self.ml_data.em.lda.id2word.doc2bow(tokens))
    df['lda'] = [
        get_lda_features(self.ml_data.em.lda, doc) for doc in df['bow']
    ]

    print("generate autoencoder feature ")
    df_ae = get_autoencoder_feature(
        df,
        self.ml_data.em.ae.max_features,
        self.ml_data.em.ae.max_len,
        self.ml_data.em.ae.encoder,
        tokenizer=self.ml_data.em.ae.tokenizer)

    print("formate data")
    # Placeholder label: set before formate_data and dropped again below.
    df['labelIndex'] = 1
    df = formate_data(df, df_tfidf, df_ae)
    feature_columns = [
        column for column in df.columns if str(column) != 'labelIndex'
    ]
    return df[feature_columns]
def feature_engineer(self):
    '''
    @description: Build all kinds of features (tfidf / word2vec,
                  autoencoder, basic NLP, cover-image embeddings, bert,
                  lda) for the train and dev splits and split them into
                  features and labels.
    @param {type} None
    @return: X_train, feature of train set
             X_test, feature of test set (built from self.ml_data.dev)
             y_train, label of train set
             y_test, label of test set
    '''
    em = self.ml_data.em

    logger.info("generate embedding feature ")
    # tfidf features and word2vec features; word2vec is not aggregated.
    ###########################################
    #          TODO: module 3 task 1.1        #
    ###########################################
    train_tfidf, train = get_embedding_feature(self.ml_data.train,
                                               em.tfidf, em.w2v)
    test_tfidf, test = get_embedding_feature(self.ml_data.dev,
                                             em.tfidf, em.w2v)

    logger.info("generate autoencoder feature ")
    # The autoencoder embedding is taken from the encoder, not the decoder.
    train_ae = get_autoencoder_feature(
        train, em.ae.max_features, em.ae.max_len, em.ae.encoder,
        tokenizer=em.ae.tokenizer)
    test_ae = get_autoencoder_feature(
        test, em.ae.max_features, em.ae.max_len, em.ae.encoder,
        tokenizer=em.ae.tokenizer)

    logger.info("generate basic feature ")
    # Basic NLP features.
    train = get_basic_feature(train)
    test = get_basic_feature(test)

    logger.info("generate modal feature ")
    # List the book-cover files once.  A set makes the per-title
    # membership test constant-time instead of a scan of the whole
    # directory listing for every row.
    cover_dir = config.root_path + '/data/book_cover/'
    cover_files = set(os.listdir(cover_dir))

    def cover_path(title):
        # Path of the cover named after the title, or '' when none exists.
        filename = title + '.jpg'
        return cover_dir + filename if filename in cover_files else ''

    # Match each title to its cover file.
    train['cover'] = train['title'].progress_apply(cover_path)
    test['cover'] = test['title'].progress_apply(cover_path)

    # Embed each cover with every image model.
    ###########################################
    #          TODO: module 3 task 1.2        #
    ###########################################
    image_models = (
        ('res_embedding', self.res_model),
        ('resnext_embedding', self.resnext_model),
        ('wide_embedding', self.wide_model),
    )
    for column, model in image_models:
        for frame in (train, test):
            # The lambda is consumed inside this iteration, so it always
            # sees the current model.
            frame[column] = frame['cover'].progress_apply(
                lambda path: get_img_embedding(path, model))

    logger.info("generate bert feature ")
    ###########################################
    #          TODO: module 3 task 1.3        #
    ###########################################
    for frame in (train, test):
        frame['bert_embedding'] = frame['text'].progress_apply(
            lambda text: get_pretrain_embedding(text, self.bert_tonkenizer,
                                                self.bert))

    logger.info("generate lda feature ")
    ###########################################
    #          TODO: module 3 task 1.4        #
    ###########################################
    # Convert the stop-word-filtered tokens to bag-of-words format.
    for frame in (train, test):
        frame['bow'] = frame['queryCutRMStopWord'].apply(
            lambda tokens: em.lda.id2word.doc2bow(tokens))
    # Derive the lda embedding from the bag of words.
    for frame in (train, test):
        frame['lda'] = [
            get_lda_features(em.lda, doc) for doc in frame['bow']
        ]

    logger.info("formate data")
    # Concatenate all the features together.
    train = formate_data(train, train_tfidf, train_ae)
    test = formate_data(test, test_tfidf, test_ae)

    # Build the train / test data: every column except the label is a
    # feature.  The column list is taken from train and applied to both.
    cols = [x for x in train.columns if str(x) not in ['labelIndex']]
    X_train = train[cols]
    X_test = test[cols]
    train["labelIndex"] = train["labelIndex"].astype(int)
    test["labelIndex"] = test["labelIndex"].astype(int)
    y_train = train["labelIndex"]
    y_test = test["labelIndex"]
    return X_train, X_test, y_train, y_test
def feature_engineer(self):
    '''
    @description: Build all kinds of features (tfidf / embedding, basic
                  NLP, cover-image embedding, bert, lda) for the train and
                  test sets and split them into features and labels.
                  Intermediate results are printed to stdout.
    @param {type} None
    @return: X_train, feature of train set
             X_test, feature of test set
             y_train, label of train set
             y_test, label of test set
    '''
    logger.info("generate embedding feature ")
    train_tfidf, test_tfidf, train, test = get_embedding_feature(
        self.ml_data)

    logger.info("generate basic feature ")
    # Basic NLP features.
    train = get_basic_feature(train)
    test = get_basic_feature(test)
    # NOTE(review): the print() calls in this method are debugging output;
    # test.loc[0] is a label lookup, so it presumably relies on the test
    # frame having an index label 0 -- confirm.
    print(test.loc[0])

    logger.info("generate modal feature ")
    # List the book-cover files once.  A set makes the per-title
    # membership test constant-time instead of a scan of the whole
    # directory listing for every row.
    cover_dir = config.root_path + '/data/book_cover/'
    cover_files = set(os.listdir(cover_dir))

    def cover_path(title):
        # Path of the cover named after the title, or '' when none exists.
        filename = title + '.jpg'
        return cover_dir + filename if filename in cover_files else ''

    train['cover'] = train.title.progress_apply(cover_path)
    test['cover'] = test.title.progress_apply(cover_path)

    # Modal embedding from the CV models.  Only the res_model embedding is
    # computed here; the resnext / wide embeddings are disabled in this
    # variant.
    train['res_embedding'] = train['cover'].progress_apply(
        lambda x: get_img_embedding(x, self.res_model))
    test['res_embedding'] = test['cover'].progress_apply(
        lambda x: get_img_embedding(x, self.res_model))
    print(len(test.loc[0, 'res_embedding']))

    logger.info("generate bert feature ")
    # Bert embedding of the text column.
    train['bert_embedding'] = train['text'].progress_apply(
        lambda x: get_pretrain_embedding(x, self.bert_tonkenizer,
                                         self.bert))
    test['bert_embedding'] = test['text'].progress_apply(
        lambda x: get_pretrain_embedding(x, self.bert_tonkenizer,
                                         self.bert))
    print(test.loc[0])

    logger.info("generate lda feature ")
    # lda feature: first convert the tokens to bag-of-words format.
    train['bow'] = train['queryCutRMStopWord'].apply(
        lambda x: self.ml_data.em.lda.id2word.doc2bow(x))
    test['bow'] = test['queryCutRMStopWord'].apply(
        lambda x: self.ml_data.em.lda.id2word.doc2bow(x))
    print(test['queryCutRMStopWord'])
    print(test['bow'])
    # Derive the lda embedding from the bag of words.
    train['lda'] = [
        get_lda_features(self.ml_data.em.lda, doc) for doc in train['bow']
    ]
    test['lda'] = [
        get_lda_features(self.ml_data.em.lda, doc) for doc in test['bow']
    ]
    print(test['lda'])
    print(test.loc[0])

    logger.info("formate data")
    print(test)
    print(test_tfidf)
    train, test = formate_data(train, test, train_tfidf, test_tfidf)
    print(test)
    print(test.loc[0])

    # Every column except the label is a feature.  The column list is
    # taken from train and applied to both frames.
    cols = [x for x in train.columns if str(x) not in ['labelIndex']]
    print(cols)
    X_train = train[cols]
    X_test = test[cols]
    print(X_test)
    train["labelIndex"] = train["labelIndex"].astype(int)
    test["labelIndex"] = test["labelIndex"].astype(int)
    y_train = train["labelIndex"]
    y_test = test["labelIndex"]
    print(y_test)
    return X_train, X_test, y_train, y_test