Esempio n. 1
0
    def process(self, title, desc):
        ###########################################
        #          TODO: module 5 task 1.1        #
        ###########################################
        # 处理数据, 生成模型预测所需要的特征
        df = pd.DataFrame([[title, desc]], columns=['title', 'desc'])
        df['text'] = df['title'] + df['desc']
        df["queryCut"] = df["text"].apply(query_cut)
        df["queryCutRMStopWord"] = df["queryCut"].apply(
            lambda x:
            [word for word in x if word not in self.ml_data.em.stopWords])

        df_tfidf, df = get_embedding_feature(df, self.ml_data.em.tfidf,
                                             self.ml_data.em.w2v)

        print("generate basic feature ")
        df = get_basic_feature(df)

        print("generate modal feature ")
        df['cover'] = ''
        df['res_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.res_model))

        df['resnext_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))

        df['wide_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))

        print("generate bert feature ")
        df['bert_embedding'] = df.text.progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer, self.bert
                                             ))

        print("generate lda feature ")
        df['bow'] = df['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.em.lda.id2word.doc2bow(x))
        df['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.em.lda, doc),
                df.bow))

        print("generate autoencoder feature ")
        df_ae = get_autoencoder_feature(df,
                                        self.ml_data.em.ae.max_features,
                                        self.ml_data.em.ae.max_len,
                                        self.ml_data.em.ae.encoder,
                                        tokenizer=self.ml_data.em.ae.tokenizer)

        print("formate data")
        df['labelIndex'] = 1
        df = formate_data(df, df_tfidf, df_ae)
        cols = [x for x in df.columns if str(x) not in ['labelIndex']]
        X_train = df[cols]
        return X_train
Esempio n. 2
0
    def feature_engineer(self):
        '''
        @description: This function is building all kings of features
        @param {type} None
        @return:
        X_train, feature of train set
        X_test, feature of test set
        y_train, label of train set
        y_test, label of test set
        '''

        logger.info("generate embedding feature ")
        # 获取tfidf 特征, word2vec 特征, word2vec不进行任何聚合
        ###########################################
        #          TODO: module 3 task 1.1        #
        ###########################################
        train_tfidf, train = get_embedding_feature(self.ml_data.train,
                                                   self.ml_data.em.tfidf,
                                                   self.ml_data.em.w2v)
        test_tfidf, test = get_embedding_feature(self.ml_data.dev,
                                                 self.ml_data.em.tfidf,
                                                 self.ml_data.em.w2v)

        logger.info("generate autoencoder feature ")
        # 获取到autoencoder 的embedding, 根据encoder 获取而不是decoder
        train_ae = get_autoencoder_feature(
            train,
            self.ml_data.em.ae.max_features,
            self.ml_data.em.ae.max_len,
            self.ml_data.em.ae.encoder,
            tokenizer=self.ml_data.em.ae.tokenizer)
        test_ae = get_autoencoder_feature(
            test,
            self.ml_data.em.ae.max_features,
            self.ml_data.em.ae.max_len,
            self.ml_data.em.ae.encoder,
            tokenizer=self.ml_data.em.ae.tokenizer)

        logger.info("generate basic feature ")
        # 获取nlp 基本特征
        train = get_basic_feature(train)
        test = get_basic_feature(test)

        logger.info("generate modal feature ")
        # 加载图书封面的文件
        cover = os.listdir(config.root_path + '/data/book_cover/')
        # 根据title 匹配图书封面
        train['cover'] = train['title'].progress_apply(
            lambda x: config.root_path + '/data/book_cover/' + x + '.jpg'
            if x + '.jpg' in cover else '')
        test['cover'] = test['title'].progress_apply(
            lambda x: config.root_path + '/data/book_cover/' + x + '.jpg'
            if x + '.jpg' in cover else '')

        # 根据封面获取封面的embedding
        ###########################################
        #          TODO: module 3 task 1.2        #
        ###########################################
        train['res_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        test['res_embedding'] = test['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.res_model))

        train['resnext_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))
        test['resnext_embedding'] = test['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))

        train['wide_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))
        test['wide_embedding'] = test['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))

        logger.info("generate bert feature ")
        ###########################################
        #          TODO: module 3 task 1.3        #
        ###########################################
        train['bert_embedding'] = train['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer, self.bert
                                             ))
        test['bert_embedding'] = test['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer, self.bert
                                             ))

        logger.info("generate lda feature ")
        ###########################################
        #          TODO: module 3 task 1.4        #
        ###########################################
        # 生成bag of word格式数据
        train['bow'] = train['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.em.lda.id2word.doc2bow(x))
        test['bow'] = test['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.em.lda.id2word.doc2bow(x))
        # 在bag of word 基础上得到lda的embedding
        train['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.em.lda, doc),
                train['bow']))
        test['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.em.lda, doc),
                test['bow']))

        logger.info("formate data")
        #  将所有的特征拼接到一起
        train = formate_data(train, train_tfidf, train_ae)
        test = formate_data(test, test_tfidf, test_ae)
        #  生成训练,测试的数据
        cols = [x for x in train.columns if str(x) not in ['labelIndex']]
        X_train = train[cols]
        X_test = test[cols]
        train["labelIndex"] = train["labelIndex"].astype(int)
        test["labelIndex"] = test["labelIndex"].astype(int)
        y_train = train["labelIndex"]
        y_test = test["labelIndex"]
        return X_train, X_test, y_train, y_test
Esempio n. 3
0
    def feature_engineer(self):
        '''
        @description: This function is building all kings of features
        @param {type} None
        @return:
        X_train, feature of train set
        X_test, feature of test set
        y_train, label of train set
        y_test, label of test set
        '''
        logger.info("generate embedding feature ")
        train_tfidf, test_tfidf, train, test = get_embedding_feature(
            self.ml_data)

        logger.info("generate basic feature ")

        # 1. 获取 基本的 NLP feature
        train = get_basic_feature(train)
        test = get_basic_feature(test)
        print(test.loc[0])

        logger.info("generate modal feature ")
        cover = os.listdir(config.root_path + '/data/book_cover/')
        train['cover'] = train.title.progress_apply(
            lambda x: config.root_path + '/data/book_cover/' + x + '.jpg'
            if x + '.jpg' in cover else '')
        test['cover'] = test.title.progress_apply(
            lambda x: config.root_path + '/data/book_cover/' + x + '.jpg'
            if x + '.jpg' in cover else '')

        # 1. 获取 三大CV模型的 modal embedding
        train['res_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        test['res_embedding'] = test['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        print(len(test.loc[0, 'res_embedding']))

        #train['resnext_embedding'] = test['cover'].progress_apply(lambda x: get_img_embedding(x,self.resnext_model))
        #test['resnext_embedding'] = test['cover'].progress_apply(lambda x: get_img_embedding(x,self.resnext_model))

        #train['wide_embedding'] = test['cover'].progress_apply(lambda x: get_img_embedding(x,self.wide_model))
        #test['wide_embedding'] = test['cover'].progress_apply(lambda x: get_img_embedding(x,self.wide_model))

        logger.info("generate bert feature ")

        # 1. 获取bert embedding
        train['bert_embedding'] = train['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer, self.bert
                                             ))
        test['bert_embedding'] = test['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer, self.bert
                                             ))

        print(test.loc[0])

        logger.info("generate lda feature ")

        # 1. 获取 lda feature

        train['bow'] = train['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.em.lda.id2word.doc2bow(x))
        test['bow'] = test['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.em.lda.id2word.doc2bow(x))
        print(test['queryCutRMStopWord'])
        print(test['bow'])
        # 在bag of word 基础上得到lda的embedding
        train['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.em.lda, doc),
                train['bow']))
        test['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.em.lda, doc),
                test['bow']))
        print(test['lda'])
        print(test.loc[0])

        logger.info("formate data")
        print(test)
        print(test_tfidf)
        train, test = formate_data(train, test, train_tfidf, test_tfidf)
        print(test)
        print(test.loc[0])

        cols = [x for x in train.columns if str(x) not in ['labelIndex']]
        print(cols)
        X_train = train[cols]
        X_test = test[cols]
        print(X_test)
        train["labelIndex"] = train["labelIndex"].astype(int)
        test["labelIndex"] = test["labelIndex"].astype(int)
        y_train = train["labelIndex"]
        y_test = test["labelIndex"]
        print(y_test)
        return X_train, X_test, y_train, y_test