def test_probability():
    # Predict probabilities.
    X, y = make_imbalance(iris.data, iris.target, ratio={0: 20, 1: 25, 2: 50},
                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=0)

    with np.errstate(divide="ignore", invalid="ignore"):
        # Normal case
        ensemble = BalancedBaggingClassifier(
            base_estimator=DecisionTreeClassifier(),
            random_state=0).fit(X_train, y_train)

        assert_array_almost_equal(np.sum(ensemble.predict_proba(X_test),
                                         axis=1),
                                  np.ones(len(X_test)))

        assert_array_almost_equal(ensemble.predict_proba(X_test),
                                  np.exp(ensemble.predict_log_proba(X_test)))

        # Degenerate case, where some classes are missing
        ensemble = BalancedBaggingClassifier(
            base_estimator=LogisticRegression(),
            random_state=0,
            max_samples=5).fit(X_train, y_train)

        assert_array_almost_equal(np.sum(ensemble.predict_proba(X_test),
                                         axis=1),
                                  np.ones(len(X_test)))

        assert_array_almost_equal(ensemble.predict_proba(X_test),
                                  np.exp(ensemble.predict_log_proba(X_test)))
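
The test relies on imbalanced-learn's make_imbalance helper to down-sample iris into a fixed 20/25/50 class distribution. A standalone sketch of that resampling step, assuming a recent imbalanced-learn where the ratio argument has been renamed sampling_strategy:

from collections import Counter

from sklearn.datasets import load_iris
from imblearn.datasets import make_imbalance

iris = load_iris()
X, y = make_imbalance(iris.data, iris.target,
                      sampling_strategy={0: 20, 1: 25, 2: 50},
                      random_state=0)
print(Counter(y))  # Counter({2: 50, 1: 25, 0: 20})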
Example #3
def cross_validation(name):
    with open('../data/conv_pred/train_data_ad_ignore_' + name + '.pickle',
              'rb') as f:
        data = pickle.load(f)
    v = DictVectorizer()
    X = v.fit_transform(data['X'])
    y = np.array(data['y'])
    kf = KFold(n_splits=5)
    fscore = 0
    ftscore = 0
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        #model = RandomForestClassifier(n_estimators=100, n_jobs=8,class_weight={0:1,1:3000})
        model = BalancedBaggingClassifier(n_estimators=100, n_jobs=8)
        model.fit(X_train, y_train)
        predict = model.predict_proba(X_test)
        score, t_score = eval(y_test, predict)
        pprint(
            sorted(zip(
                np.mean([
                    est.steps[1][1].feature_importances_
                    for est in model.estimators_
                ],
                        axis=0), v.feature_names_),
                   key=lambda x: x[0],
                   reverse=True))
        print('score : ', str(score))
        print('true_score : ', str(t_score))
        fscore += score
        ftscore += t_score
    print('\n')
    # average over the 5 folds
    print('final score : ', str(fscore / 5))
    print('final true_score : ', str(ftscore / 5))
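
Each fitted member of model.estimators_ is an imbalanced-learn Pipeline whose first step is the under-sampler and whose second step is the actual classifier, which is why the snippet indexes est.steps[1][1] before averaging feature importances. A minimal, self-contained sketch of that aggregation on synthetic data, assuming the default tree base estimator (which exposes feature_importances_):

import numpy as np
from sklearn.datasets import make_classification
from imblearn.ensemble import BalancedBaggingClassifier

X, y = make_classification(n_samples=300, weights=[0.9, 0.1], random_state=0)
model = BalancedBaggingClassifier(n_estimators=10, random_state=0).fit(X, y)

# Average the per-tree importances across the ensemble.
importances = np.mean(
    [est.steps[1][1].feature_importances_ for est in model.estimators_],
    axis=0)
print(importances.argsort()[::-1][:5])  # indices of the top-5 features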
def Model_3(train, test):
    ''' Train the model and save the predictions to CSV files.
        train : training set
        test : test set
    '''
    # Preprocessing
    X_train = [DPC(i) for i in train['Sequence']]
    X_test = [DPC(i) for i in test['Sequence']]
    Y_train = train['label']

    # Training
    clf = BalancedBaggingClassifier(base_estimator=RandomForestClassifier(
        bootstrap=False, n_estimators=450, random_state=6),
                                    n_estimators=25,
                                    n_jobs=-1,
                                    random_state=6,
                                    verbose=1)
    clf.fit(X_train, Y_train)

    # Predicting
    Y_pred = clf.predict(X_test)
    Y_prob = [x[1] for x in clf.predict_proba(X_test)]
    result = pd.DataFrame()
    result["ID"] = test["ID"]
    result["Label"] = Y_prob
    result.to_csv("Submission_3.csv", index=False)
    result["Label"] = Y_pred
    result.to_csv("Predictions_3.csv", index=False)
Example #5
def cross_validation(x):
    with open('../data/conv_pred/train_data_' + x + '.pickle', 'rb') as f:
        data = pickle.load(f)
    print(data)
    v = DictVectorizer()
    X = v.fit_transform(data['X'])
    y = np.array(data['y'])

    zero = 0
    one = 0
    for i in y:
        if i == 0:
            zero += 1
        else:
            one += 1
    print(zero)
    print(one)

    cv = 5
    kf = KFold(n_splits=cv)
    fscore = 0
    ftscore = 0
    all_f_value = 0
    all_prec = 0
    for train_index, test_index in tqdm(kf.split(X)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        #model = RandomForestRe(n_estimators=100, n_jobs=8)
        model = BalancedBaggingClassifier(n_estimators=100, n_jobs=8)
        #model = xgb.XGBClassifier(n_estimators=500,max_delta_step=1,scale_pos_weight=zero/one)
        model.fit(X_train, y_train)
        predict = model.predict_proba(X_test)
        precision, recall, f_value, all_pre = eval(y_test, predict)
        all_prec += all_pre
        fscore += precision
        ftscore += recall
        all_f_value += f_value
    pprint(
        sorted(zip(
            np.mean([
                est.steps[1][1].feature_importances_
                for est in model.estimators_
            ],
                    axis=0), v.feature_names_),
               key=lambda x: x[0],
               reverse=True))
    print('\n')
    print('final precision : ', str(fscore / cv))
    print('final recall : ', str(ftscore / cv))
    print('final f-value : ', str(all_f_value / cv))
    print('final all_precision : ', str(all_prec / cv))
class Classifier(BaseEstimator):
    def __init__(self):
        # mimicking balanced random forest with the BalancedBaggingClassifier
        # and DecisionTreeClassifier combination
        self.bbc = BalancedBaggingClassifier(
            base_estimator=DecisionTreeClassifier(max_features='auto'),
            ratio=determine_ratio, random_state=0, n_estimators=50, n_jobs=1)

    def fit(self, X, y):
        self.bbc.fit(X, y)

    def predict_proba(self, X):
        return self.bbc.predict_proba(X)
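
determine_ratio is defined elsewhere in that project. A minimal usage sketch of the same balanced-random-forest construction on synthetic data, falling back to the default sampling strategy, dropping the deprecated max_features='auto', and assuming an imbalanced-learn version that still accepts base_estimator (newer releases call it estimator):

from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from imblearn.ensemble import BalancedBaggingClassifier

# Synthetic 9:1 imbalanced problem.
X, y = make_classification(n_samples=1000, weights=[0.9, 0.1], random_state=0)

bbc = BalancedBaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    random_state=0, n_estimators=50, n_jobs=1)
bbc.fit(X, y)
print(bbc.predict_proba(X[:5]))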
Example #7
def cross_validation_another(x):
    with open('../data/conv_pred/super_train_data_day_' + 'A' + '.pickle',
              'rb') as f:
        data = pickle.load(f)
    with open('../data/conv_pred/super_test_data_day_' + 'A' + '.pickle',
              'rb') as f:
        test = pickle.load(f)
    v = DictVectorizer()
    X_train = v.fit_transform(data['X'])
    y_train = np.array(data['y'])
    X_test = v.transform(test['X'])
    y_test = np.array(test['y'])
    zero = 0
    one = 0
    for i in y_train:
        if i == 0:
            zero += 1
        else:
            one += 1
    print(zero)
    print(one)

    model = BalancedBaggingClassifier(n_estimators=100,
                                      n_jobs=8,
                                      max_samples=0.6)
    #model = xgb.XGBClassifier(n_estimators=500, max_delta_step=1, scale_pos_weight=zero / one)
    model.fit(X_train, y_train)
    predict = model.predict_proba(X_test)
    precision, recall, f_value, all_pre = eval(y_test, predict)
    all_prec = all_pre
    fscore = precision
    ftscore = recall
    all_f_value = f_value
    print('\n')
    print('final precision : ', str(fscore))
    print('final recall : ', str(ftscore))
    print('final f-value : ', str(all_f_value))
    print('final all_precision : ', str(all_prec))
Example #8
    X_train_o = X_train[:, 0:original_len]
    X_test_o = X_test[:, 0:original_len]

    X_train_n = X_train[:, original_len:]
    X_test_n = X_test[:, original_len:]

    for clf, clf_name in zip(clf_list, clf_name_list):
        print('processing', clf_name, 'round', i + 1)
        if clf_name != 'xgb':
            clf = BalancedBaggingClassifier(base_estimator=clf,
                                            ratio='auto',
                                            replacement=False)

        # fully supervised
        clf.fit(X_train_o, y_train.ravel())
        y_pred = clf.predict_proba(X_test_o)

        roc_score = roc_auc_score(y_test, y_pred[:, 1])
        prec_n = get_precn(y_test, y_pred[:, 1])

        result_dict[clf_name + 'ROC' + 'o'].append(roc_score)
        result_dict[clf_name + 'PRC@n' + 'o'].append(prec_n)

        # unsupervised
        clf.fit(X_train_n, y_train.ravel())
        y_pred = clf.predict_proba(X_test_n)

        roc_score = roc_auc_score(y_test, y_pred[:, 1])
        prec_n = get_precn(y_test, y_pred[:, 1])

        result_dict[clf_name + 'ROC' + 'n'].append(roc_score)
Example #9
class Models(object):
    def __init__(self,
                 model_path=None,
                 feature_engineer=False,
                 train_mode=True):
        '''
        @description: initialize the class, e.g. the models
        @param {type} :
        feature_engineer: whether to use feature engineering; if `False`, compare common ML models instead
        res_model: ResNet model
        resnext_model: ResNeXt model
        wide_model: Wide ResNet model
        bert: BERT model
        ml_data: MLData instance
        @return: no return
        '''
        # Load the image models (ResNet, ResNeXt, Wide ResNet); move them to CUDA if available
        ###########################################
        #          TODO: module 2 task 2.1        #
        ###########################################
        self.res_model = torchvision.models.resnet152(
            pretrained=True)  # res model for modal feature [1* 1000]
        self.res_model = self.res_model.to(config.device)
        self.resnext_model = torchvision.models.resnext101_32x8d(
            pretrained=True)
        self.resnext_model = self.resnext_model.to(config.device)
        self.wide_model = torchvision.models.wide_resnet101_2(pretrained=True)
        self.wide_model = self.wide_model.to(config.device)
        # Load the BERT model; move it to CUDA if available
        self.bert_tonkenizer = BertTokenizer.from_pretrained(config.root_path +
                                                             '/model/bert')
        self.bert = BertModel.from_pretrained(config.root_path + '/model/bert')
        self.bert = self.bert.to(config.device)

        # Initialize the MLData class; debug_mode=True uses a subset of the data, train_mode indicates whether to train
        self.ml_data = MLData(debug_mode=True, train_mode=train_mode)
        # If not training, load a trained model and use it for prediction
        if train_mode:
            self.model = lgb.LGBMClassifier(objective='multiclass',
                                            n_jobs=10,
                                            num_class=33,
                                            num_leaves=30,
                                            reg_alpha=10,
                                            reg_lambda=200,
                                            max_depth=3,
                                            learning_rate=0.05,
                                            n_estimators=2000,
                                            bagging_freq=1,
                                            bagging_fraction=0.9,
                                            feature_fraction=0.8,
                                            seed=1440)

        else:
            self.load(model_path)
            labelNameToIndex = json.load(
                open(config.root_path + '/data/label2id.json',
                     encoding='utf-8'))
            self.ix2label = {v: k for k, v in labelNameToIndex.items()}

    def feature_engineer(self):
        '''
        @description: build all kinds of features
        @param {type} None
        @return:
        X_train, features of the train set
        X_test, features of the test set
        y_train, labels of the train set
        y_test, labels of the test set
        '''

        logger.info("generate embedding feature ")
        # Get tf-idf and word2vec features; the word2vec vectors are not aggregated
        ###########################################
        #          TODO: module 3 task 1.1        #
        ###########################################
        train_tfidf, train = get_embedding_feature(self.ml_data.train,
                                                   self.ml_data.em.tfidf,
                                                   self.ml_data.em.w2v)
        test_tfidf, test = get_embedding_feature(self.ml_data.dev,
                                                 self.ml_data.em.tfidf,
                                                 self.ml_data.em.w2v)

        logger.info("generate autoencoder feature ")
        # Get the autoencoder embedding, from the encoder rather than the decoder
        train_ae = get_autoencoder_feature(
            train,
            self.ml_data.em.ae.max_features,
            self.ml_data.em.ae.max_len,
            self.ml_data.em.ae.encoder,
            tokenizer=self.ml_data.em.ae.tokenizer)
        test_ae = get_autoencoder_feature(
            test,
            self.ml_data.em.ae.max_features,
            self.ml_data.em.ae.max_len,
            self.ml_data.em.ae.encoder,
            tokenizer=self.ml_data.em.ae.tokenizer)

        logger.info("generate basic feature ")
        # Get basic NLP features
        train = get_basic_feature(train)
        test = get_basic_feature(test)

        logger.info("generate modal feature ")
        # List the book-cover files
        cover = os.listdir(config.root_path + '/data/book_cover/')
        # Match each book cover by title
        train['cover'] = train['title'].progress_apply(
            lambda x: config.root_path + '/data/book_cover/' + x + '.jpg'
            if x + '.jpg' in cover else '')
        test['cover'] = test['title'].progress_apply(
            lambda x: config.root_path + '/data/book_cover/' + x + '.jpg'
            if x + '.jpg' in cover else '')

        # Get the embedding of each cover image
        ###########################################
        #          TODO: module 3 task 1.2        #
        ###########################################
        train['res_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        test['res_embedding'] = test['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.res_model))

        train['resnext_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))
        test['resnext_embedding'] = test['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))

        train['wide_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))
        test['wide_embedding'] = test['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))

        logger.info("generate bert feature ")
        ###########################################
        #          TODO: module 3 task 1.3        #
        ###########################################
        train['bert_embedding'] = train['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer, self.bert
                                             ))
        test['bert_embedding'] = test['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer, self.bert
                                             ))

        logger.info("generate lda feature ")
        ###########################################
        #          TODO: module 3 task 1.4        #
        ###########################################
        # Build bag-of-words data
        train['bow'] = train['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.em.lda.id2word.doc2bow(x))
        test['bow'] = test['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.em.lda.id2word.doc2bow(x))
        # Get the LDA embedding on top of the bag-of-words representation
        train['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.em.lda, doc),
                train['bow']))
        test['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.em.lda, doc),
                test['bow']))

        logger.info("formate data")
        #  将所有的特征拼接到一起
        train = formate_data(train, train_tfidf, train_ae)
        test = formate_data(test, test_tfidf, test_ae)
        #  Build the train and test data
        cols = [x for x in train.columns if str(x) not in ['labelIndex']]
        X_train = train[cols]
        X_test = test[cols]
        train["labelIndex"] = train["labelIndex"].astype(int)
        test["labelIndex"] = test["labelIndex"].astype(int)
        y_train = train["labelIndex"]
        y_test = test["labelIndex"]
        return X_train, X_test, y_train, y_test

    def param_search(self, search_method='grid'):
        '''
        @description: use parameter search to find the best parameters
        @param {type}
        search_method: two options, 'grid' or 'bayesian' (Bayesian optimization)
        @return: None
        '''
        # Use grid search or Bayesian optimization to find the best parameters
        if search_method == 'grid':
            logger.info("use grid search")
            self.model = Grid_Train_model(self.model, self.X_train,
                                          self.X_test, self.y_train,
                                          self.y_test)
        elif search_method == 'bayesian':
            logger.info("use bayesian optimization")
            trn_data = lgb.Dataset(data=self.X_train,
                                   label=self.y_train,
                                   free_raw_data=False)
            param = bayes_parameter_opt_lgb(trn_data)
            logger.info("best param", param)
            return param

    def unbalance_helper(self,
                         imbalance_method='under_sampling',
                         search_method='grid'):
        '''
        @description: handle imbalanced data, then search for the best parameters
        @param {type}
        imbalance_method: three options, 'under_sampling' (ClusterCentroids), 'over_sampling' (SMOTE), or 'ensemble' (BalancedBaggingClassifier)
        search_method: two options, 'grid' or 'bayesian' (Bayesian optimization)
        @return: None
        '''
        logger.info("get all freature")
        # 生成所有feature
        self.X_train, self.X_test, self.y_train, self.y_test = self.feature_engineer(
        )
        model_name = None
        # Choose how to handle class imbalance: over-sampling, under-sampling, or ensemble
        ###########################################
        #          TODO: module 4 task 1.1        #
        ###########################################
        if imbalance_method == 'over_sampling':
            logger.info("Use SMOTE deal with unbalance data ")
            self.X_train, self.y_train = SMOTE().fit_resample(
                self.X_train, self.y_train)
            self.X_test, self.y_test = SMOTE().fit_resample(
                self.X_test, self.y_test)
            model_name = 'lgb_over_sampling'
        elif imbalance_method == 'under_sampling':
            logger.info("Use ClusterCentroids deal with unbalance data ")
            self.X_train, self.y_train = ClusterCentroids(
                random_state=0).fit_resample(self.X_train, self.y_train)
            self.X_test, self.y_test = ClusterCentroids(
                random_state=0).fit_resample(self.X_test, self.y_test)
            model_name = 'lgb_under_sampling'
        elif imbalance_method == 'ensemble':
            self.model = BalancedBaggingClassifier(
                base_estimator=DecisionTreeClassifier(),
                sampling_strategy='auto',
                replacement=False,
                random_state=0)
            model_name = 'ensemble'
        logger.info('search best param')
        # Use set_params to apply the best searched parameters to the model
        if imbalance_method != 'ensemble':
            ###########################################
            #          TODO: module 4 task 1.2        #
            ###########################################
            # param = self.param_search(search_method=search_method)
            # param['params']['num_leaves'] = int(param['params']['num_leaves'])
            # param['params']['max_depth'] = int(param['params']['max_depth'])
            param = {}
            param['params'] = {}
            param['params']['num_leaves'] = 3
            param['params']['max_depth'] = 5
            self.model = self.model.set_params(**param['params'])
        logger.info('fit model ')
        # Train the model and report its results
        self.model.fit(self.X_train, self.y_train)
        ###########################################
        #          TODO: module 4 task 1.3        #
        ###########################################
        Test_predict_label = self.model.predict(self.X_test)
        Train_predict_label = self.model.predict(self.X_train)
        per, acc, recall, f1 = get_score(self.y_train, self.y_test,
                                         Train_predict_label,
                                         Test_predict_label)
        # Report the train-set precision
        logger.info('Train accuracy %s' % per)
        # Report the test-set accuracy
        logger.info('test accuracy %s' % acc)
        # Report the recall
        logger.info('test recall %s' % recall)
        # Report the F1-score
        logger.info('test F1_score %s' % f1)
        self.save(model_name)

    def process(self, title, desc):
        ###########################################
        #          TODO: module 5 task 1.1        #
        ###########################################
        # Process the input and build the features the model needs for prediction
        df = pd.DataFrame([[title, desc]], columns=['title', 'desc'])
        df['text'] = df['title'] + df['desc']
        df["queryCut"] = df["text"].apply(query_cut)
        df["queryCutRMStopWord"] = df["queryCut"].apply(
            lambda x:
            [word for word in x if word not in self.ml_data.em.stopWords])

        df_tfidf, df = get_embedding_feature(df, self.ml_data.em.tfidf,
                                             self.ml_data.em.w2v)

        print("generate basic feature ")
        df = get_basic_feature(df)

        print("generate modal feature ")
        df['cover'] = ''
        df['res_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.res_model))

        df['resnext_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))

        df['wide_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))

        print("generate bert feature ")
        df['bert_embedding'] = df.text.progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer, self.bert
                                             ))

        print("generate lda feature ")
        df['bow'] = df['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.em.lda.id2word.doc2bow(x))
        df['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.em.lda, doc),
                df.bow))

        print("generate autoencoder feature ")
        df_ae = get_autoencoder_feature(df,
                                        self.ml_data.em.ae.max_features,
                                        self.ml_data.em.ae.max_len,
                                        self.ml_data.em.ae.encoder,
                                        tokenizer=self.ml_data.em.ae.tokenizer)

        print("formate data")
        df['labelIndex'] = 1
        df = formate_data(df, df_tfidf, df_ae)
        cols = [x for x in df.columns if str(x) not in ['labelIndex']]
        X_train = df[cols]
        return X_train

    def predict(self, title, desc):
        '''
        @description: predict the book category from the input title and desc
        @param {type}
        title: input title
        desc: input description
        @return: label and probability
        '''
        ###########################################
        #          TODO: module 5 task 1.1        #
        ###########################################
        inputs = self.process(title, desc)
        label = self.ix2label[self.model.predict(inputs)[0]]
        proba = np.max(self.model.predict_proba(inputs))
        return label, proba

    def save(self, model_name):
        '''
        @description: save the model
        @param {type}
        model_name: file name for saving
        @return: None
        '''
        ###########################################
        #          TODO: module 4 task 1.4        #
        ###########################################
        joblib.dump(self.model, root_path + '/model/ml_model/' + model_name)

    def load(self, path):
        '''
        @description: load the model
        @param {type}
        path: model path
        @return: None
        '''
        ###########################################
        #          TODO: module 4 task 1.4        #
        ###########################################
        self.model = joblib.load(path)
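
The three branches of unbalance_helper map to three imbalanced-learn techniques. A self-contained sketch of each on synthetic data, assuming an imbalanced-learn version that still accepts base_estimator:

from collections import Counter

from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids
from imblearn.ensemble import BalancedBaggingClassifier

X, y = make_classification(n_samples=600, weights=[0.9, 0.1], random_state=0)
print('original:', Counter(y))

# over_sampling: SMOTE synthesizes minority samples up to the majority count.
X_over, y_over = SMOTE().fit_resample(X, y)
print('SMOTE:', Counter(y_over))

# under_sampling: ClusterCentroids replaces majority samples with centroids.
X_under, y_under = ClusterCentroids(random_state=0).fit_resample(X, y)
print('ClusterCentroids:', Counter(y_under))

# ensemble: each bagging member is trained on a balanced bootstrap sample.
clf = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                sampling_strategy='auto', replacement=False,
                                random_state=0).fit(X, y)
print('ensemble:', clf.predict_proba(X[:3]))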
    # ADD CODE HERE
    #first_test(X_train, y_train, X_test, y_test)
    #second_test(X_train, y_train, X_test, y_test)
    third_test(X_train, y_train, X_test, y_test)
    #fourth_test(X_train, y_train, X_test, y_test)
    #fifth_test(X_train, y_train, X_test, y_test)
    #sixth_test(X_train, y_train, X_test, y_test)

    #X, y = SMOTETomek(n_jobs=-1).fit_sample(X_LS, y_LS)
    #do_cv_RF(X, y)

    #score = cross_val_score(model, X_LS, y_LS, cv=10, scoring="roc_auc")
    # print(np.mean(score))
    exit("No need to make submission now")
    with measure_time('Training'):
        model.fit(X_LS, y_LS)

    # PREDICTION
    TS = load_from_csv(args.ts)
    X_TS = create_fingerprints(TS["SMILES"].values)

    # Predict
    y_pred = model.predict_proba(X_TS)[:, 1]

    # Estimated AUC of the model
    auc_predicted = 0.75  # it seems a bit pessimistic, right?

    # Making the submission file
    fname = make_submission(y_pred, auc_predicted, 'Bagging_model')
    print('Submission file "{}" successfully written'.format(fname))
Example #11
def model_baseline3(x_train, y_train, x_test, y_test):
    bagging = BaggingClassifier(random_state=0)
    balanced_bagging = BalancedBaggingClassifier(random_state=0)
    bagging.fit(x_train, y_train)
    balanced_bagging.fit(x_train, y_train)
    prob = bagging.predict_proba(x_test)[:, 1]
    predict_score = [float('%.2f' % x) for x in prob]
    loss_val = log_loss(y_test, predict_score)
    y_pred = [1 if x > 0.5 else 0 for x in predict_score]
    fpr, tpr, thresholds = roc_curve(y_test, predict_score)
    mean_fpr = np.linspace(0, 1, 100)
    mean_tpr = interp(mean_fpr, fpr, tpr)
    x_auc = auc(fpr, tpr)
    fig = plt.figure('Bagging')
    ax = fig.add_subplot(1, 1, 1)
    name = 'base_Bagging'
    plt.plot(mean_fpr,
             mean_tpr,
             linestyle='--',
             label='{} (area = %0.2f, logloss = %0.2f)'.format(name) %
             (x_auc, loss_val),
             lw=2)
    y_pred_bagging = bagging.predict(x_test)
    cm_bagging = confusion_matrix(y_test, y_pred_bagging)
    cm1 = plt.figure()
    plot_confusion_matrix(cm_bagging,
                          classes=[0, 1],
                          title='Confusion matrix of BaggingClassifier')
    # balanced_bagging
    prob = balanced_bagging.predict_proba(x_test)[:, 1]
    predict_score = [float('%.2f' % x) for x in prob]
    loss_val = log_loss(y_test, predict_score)
    fpr, tpr, thresholds = roc_curve(y_test, predict_score)
    mean_fpr = np.linspace(0, 1, 100)
    mean_tpr = interp(mean_fpr, fpr, tpr)
    x_auc = auc(fpr, tpr)
    plt.figure('Bagging')  # select the figure
    name = 'base_Balanced_Bagging'
    plt.plot(mean_fpr,
             mean_tpr,
             linestyle='--',
             label='{} (area = %0.2f, logloss = %0.2f)'.format(name) %
             (x_auc, loss_val),
             lw=2)
    y_pred_balanced_bagging = balanced_bagging.predict(x_test)
    cm_balanced_bagging = confusion_matrix(y_test, y_pred_balanced_bagging)
    cm2 = plt.figure()
    plot_confusion_matrix(cm_balanced_bagging,
                          classes=[0, 1],
                          title='Confusion matrix of BalancedBagging')
    plt.figure('Bagging')  # select the figure
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k', label='Luck')
    # make nice plotting
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
    ax.spines['left'].set_position(('outward', 10))
    ax.spines['bottom'].set_position(('outward', 10))
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()
    return cm1, cm2, fig
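
plot_confusion_matrix, interp, and the ROC plumbing above are assumed to be imported elsewhere in that script. Stripped of plotting, the comparison it draws reduces to fitting both ensembles and scoring them, e.g.:

from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedBaggingClassifier

X, y = make_classification(n_samples=1000, weights=[0.95, 0.05], random_state=0)
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0)

for name, clf in [('Bagging', BaggingClassifier(random_state=0)),
                  ('BalancedBagging', BalancedBaggingClassifier(random_state=0))]:
    clf.fit(x_train, y_train)
    auc_val = roc_auc_score(y_test, clf.predict_proba(x_test)[:, 1])
    print(name, round(auc_val, 3))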
Example #12
def buildModel(X, y):
    # X = np.reshape(X,(X.shape[0],X.shape[1] * X.shape[2]))
    print(X.shape, y.shape)
    scaler = StandardScaler()
    print(scaler.fit(X))
    scaled_train_x = scaler.transform(X)
    X_train, X_test, y_train, y_test = train_test_split(scaled_train_x,
                                                        y,
                                                        random_state=19,
                                                        test_size=0.3)

    bag = BalancedBaggingClassifier(n_estimators=200, random_state=19)
    svm = SVC(class_weight='balanced',
              random_state=19,
              decision_function_shape='ovr')
    neural = MLPClassifier(max_iter=500,
                           random_state=19,
                           solver='lbfgs',
                           alpha=1e-5,
                           hidden_layer_sizes=(49, 8, 4))
    ada = AdaBoostClassifier(n_estimators=100, random_state=19)
    logistic = LogisticRegression(solver='lbfgs', max_iter=500)

    bag.fit(X_train, y_train)
    svm.fit(X_train, y_train)
    neural.fit(X_train, y_train)
    ada.fit(X_train, y_train)
    logistic.fit(X_train, y_train)

    # joblib.dump(bag,'bag.pkl')
    # joblib.dump(scaler,'scaler.pkl')

    y_pred = bag.predict(X_test)
    y_pred2 = svm.predict(X_test)
    y_pred3 = neural.predict(X_test)
    y_pred4 = ada.predict(X_test)
    y_pred5 = logistic.predict(X_test)

    print(matthews_corrcoef(y_test, y_pred))
    print(matthews_corrcoef(y_test, y_pred2))
    print(matthews_corrcoef(y_test, y_pred3))
    print(matthews_corrcoef(y_test, y_pred4))
    print(matthews_corrcoef(y_test, y_pred5))

    print(confusion_matrix(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred2))
    print(confusion_matrix(y_test, y_pred3))
    print(confusion_matrix(y_test, y_pred4))
    print(confusion_matrix(y_test, y_pred5))

    print(classification_report_imbalanced(y_test, y_pred))
    print(classification_report_imbalanced(y_test, y_pred2))
    print(classification_report_imbalanced(y_test, y_pred3))
    print(classification_report_imbalanced(y_test, y_pred4))
    print(classification_report_imbalanced(y_test, y_pred5))

    probs_ada = ada.predict_proba(X_test)
    probs_bag = bag.predict_proba(X_test)
    probs_neural = neural.predict_proba(X_test)
    probs_logistic = logistic.predict_proba(X_test)
    probs_svm = svm.decision_function(X_test)

    ROCplot(probs_ada, y_test, "Plots/ROCplotADA-organelle.png")
    ROCplot(probs_logistic, y_test, "Plots/ROCplotLogistic-organelle.png")
    ROCplot(probs_bag, y_test, "Plots/ROCplotBAG-organelle.png")
    ROCplot(probs_neural, y_test, "Plots/ROCplotNeural-organelle.png")
    ROCplot(probs_svm, y_test, "Plots/ROCplotSVM-organelle.png")

    multiROCplot(
        [probs_ada, probs_logistic, probs_bag, probs_neural, probs_svm],
        y_test, "Plots/multiROCplot.png",
        ['AdaBoost', 'Logistic', 'Bagging Classifier', 'MLP', 'SVM'])
Example #13
class Models(object):
    def __init__(self, feature_engineer=False):
        '''
        @description: initialize the class, e.g. the models
        @param {type} :
        feature_engineer: whether to use feature engineering; if `False`, compare common ML models instead
        res_model: ResNet model
        resnext_model: ResNeXt model
        wide_model: Wide ResNet model
        bert: BERT model
        ml_data: MLData instance
        @return: no return
        '''
        # 1. Initialize the resnet152 model with torchvision
        # 2. Initialize the resnext101_32x8d model with torchvision
        # 3. Initialize the wide_resnet101_2 model with torchvision
        # 4. Load the BERT model
        print("load")
        self.res_model = torchvision.models.resnet152(pretrained=False)
        self.res_model.load_state_dict(
            torch.load(config.root_path +
                       '/model/resnet150/resnet152-b121ed2d.pth'))
        self.res_model = self.res_model.to(config.device)
        self.resnext_model = torchvision.models.resnext101_32x8d(
            pretrained=True)
        self.resnext_model = self.resnext_model.to(config.device)
        self.wide_model = torchvision.models.wide_resnet101_2(pretrained=True)
        self.wide_model = self.wide_model.to(config.device)

        self.bert_tonkenizer = BertTokenizer.from_pretrained(config.root_path +
                                                             '/model/bert')
        self.bert = BertModel.from_pretrained(config.root_path + '/model/bert')
        self.bert = self.bert.to(config.device)
        self.ml_data = MLData(debug_mode=True)
        if feature_engineer:
            self.model = lgb.LGBMClassifier(objective='multiclass',
                                            device='gpu',
                                            n_jobs=10,
                                            num_class=33,
                                            num_leaves=30,
                                            reg_alpha=10,
                                            reg_lambda=200,
                                            max_depth=3,
                                            learning_rate=0.05,
                                            n_estimators=2000,
                                            bagging_freq=1,
                                            bagging_fraction=0.9,
                                            feature_fraction=0.8,
                                            seed=1440)
        else:
            self.models = [
                RandomForestClassifier(n_estimators=500,
                                       max_depth=5,
                                       random_state=0),
                LogisticRegression(solver='liblinear', random_state=0),
                MultinomialNB(),
                SVC(),
                lgb.LGBMClassifier(objective='multiclass',
                                   n_jobs=10,
                                   num_class=33,
                                   num_leaves=30,
                                   reg_alpha=10,
                                   reg_lambda=200,
                                   max_depth=3,
                                   learning_rate=0.05,
                                   n_estimators=2000,
                                   bagging_freq=1,
                                   bagging_fraction=0.8,
                                   feature_fraction=0.8),
            ]

    def feature_engineer(self):
        '''
        @description: build all kinds of features
        @param {type} None
        @return:
        X_train, features of the train set
        X_test, features of the test set
        y_train, labels of the train set
        y_test, labels of the test set
        '''
        logger.info("generate embedding feature ")
        train_tfidf, test_tfidf, train, test = get_embedding_feature(
            self.ml_data)

        logger.info("generate basic feature ")

        # 1. Get basic NLP features
        train = get_basic_feature(train)
        test = get_basic_feature(test)
        print(test.loc[0])

        logger.info("generate modal feature ")
        cover = os.listdir(config.root_path + '/data/book_cover/')
        train['cover'] = train.title.progress_apply(
            lambda x: config.root_path + '/data/book_cover/' + x + '.jpg'
            if x + '.jpg' in cover else '')
        test['cover'] = test.title.progress_apply(
            lambda x: config.root_path + '/data/book_cover/' + x + '.jpg'
            if x + '.jpg' in cover else '')

        # 1. Get the modal embeddings from the three CV models
        train['res_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        test['res_embedding'] = test['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        print(len(test.loc[0, 'res_embedding']))

        #train['resnext_embedding'] = test['cover'].progress_apply(lambda x: get_img_embedding(x,self.resnext_model))
        #test['resnext_embedding'] = test['cover'].progress_apply(lambda x: get_img_embedding(x,self.resnext_model))

        #train['wide_embedding'] = test['cover'].progress_apply(lambda x: get_img_embedding(x,self.wide_model))
        #test['wide_embedding'] = test['cover'].progress_apply(lambda x: get_img_embedding(x,self.wide_model))

        logger.info("generate bert feature ")

        # 1. Get the BERT embedding
        train['bert_embedding'] = train['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer, self.bert
                                             ))
        test['bert_embedding'] = test['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer, self.bert
                                             ))

        print(test.loc[0])

        logger.info("generate lda feature ")

        # 1. Get the LDA features

        train['bow'] = train['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.em.lda.id2word.doc2bow(x))
        test['bow'] = test['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.em.lda.id2word.doc2bow(x))
        print(test['queryCutRMStopWord'])
        print(test['bow'])
        # Get the LDA embedding on top of the bag-of-words representation
        train['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.em.lda, doc),
                train['bow']))
        test['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.em.lda, doc),
                test['bow']))
        print(test['lda'])
        print(test.loc[0])

        logger.info("formate data")
        print(test)
        print(test_tfidf)
        train, test = formate_data(train, test, train_tfidf, test_tfidf)
        print(test)
        print(test.loc[0])

        cols = [x for x in train.columns if str(x) not in ['labelIndex']]
        print(cols)
        X_train = train[cols]
        X_test = test[cols]
        print(X_test)
        train["labelIndex"] = train["labelIndex"].astype(int)
        test["labelIndex"] = test["labelIndex"].astype(int)
        y_train = train["labelIndex"]
        y_test = test["labelIndex"]
        print(y_test)
        return X_train, X_test, y_train, y_test

    def param_search(self, search_method='grid'):
        '''
        @description: use parameter search to find the best parameters
        @param {type}
        search_method: two options, 'grid' or 'bayesian' (Bayesian optimization)
        @return: None
        '''
        if search_method == 'grid':
            logger.info("use grid search")
            self.model = Grid_Train_model(self.model, self.X_train,
                                          self.X_test, self.y_train,
                                          self.y_test)
        elif search_method == 'bayesian':
            logger.info("use bayesian optimization")
            trn_data = lgb.Dataset(data=self.X_train,
                                   label=self.y_train,
                                   free_raw_data=False)
            param = bayes_parameter_opt_lgb(trn_data)
            logger.info("best param", param)
            return param

    def unbalance_helper(self,
                         imbalance_method='under_sampling',
                         search_method='grid'):
        '''
        @description: handle imbalanced data, then search for the best parameters
        @param {type}
        imbalance_method: three options, 'under_sampling' (ClusterCentroids), 'over_sampling' (SMOTE), or 'ensemble' (BalancedBaggingClassifier)
        search_method: two options, 'grid' or 'bayesian' (Bayesian optimization)
        @return: None
        '''
        logger.info("get all freature")
        self.X_train, self.X_test, self.y_train, self.y_test = self.feature_engineer(
        )
        model_name = None
        if imbalance_method == 'over_sampling':
            logger.info("Use SMOTE deal with unbalance data ")

            # 1. Use over-sampling to handle class imbalance
            print(self.y_train)
            self.X_train, self.y_train = SMOTE().fit_resample(
                self.X_train, self.y_train)
            print(self.y_train)
            self.X_test, self.y_test = SMOTE().fit_resample(
                self.X_test, self.y_test)
            model_name = 'lgb_over_sampling'

        elif imbalance_method == 'under_sampling':
            logger.info("Use ClusterCentroids deal with unbalance data ")

            # 1. Use under-sampling to handle class imbalance
            print(self.X_train)
            #print(self.y_train)
            self.X_train, self.y_train = ClusterCentroids(
                random_state=0).fit_resample(self.X_train, self.y_train)
            print(self.X_train)
            #print(self.y_train)
            self.X_test, self.y_test = ClusterCentroids(
                random_state=0).fit_resample(self.X_test, self.y_test)
            model_name = 'lgb_under_sampling'

        elif imbalance_method == 'ensemble':
            self.model = BalancedBaggingClassifier(
                base_estimator=DecisionTreeClassifier(),
                sampling_strategy='auto',
                replacement=False,
                random_state=0)
            model_name = 'ensemble'
        logger.info('search best param')

        if imbalance_method != 'ensemble':
            param = self.param_search(search_method=search_method)
            param['params']['num_leaves'] = int(param['params']['num_leaves'])
            param['params']['max_depth'] = int(param['params']['max_depth'])
            self.model = self.model.set_params(**param['params'])

        logger.info('fit model ')
        self.model.fit(self.X_train, self.y_train)

        # 1. Predict the labels of the test set
        # 2. Predict the labels of the train set
        # 3. Compute precision, accuracy, recall, and F1-score

        Test_predict_label = self.model.predict(self.X_test)
        Train_predict_label = self.model.predict(self.X_train)
        per, acc, recall, f1 = get_score(self.y_train, self.y_test,
                                         Train_predict_label,
                                         Test_predict_label)

        # Report the train-set accuracy
        logger.info('Train accuracy %s' % per)
        # Report the test-set accuracy
        logger.info('test accuracy %s' % acc)
        # Report the recall
        logger.info('test recall %s' % recall)
        # Report the F1-score
        logger.info('test F1_score %s' % f1)
        self.save(model_name)

    def model_select(self,
                     X_train,
                     X_test,
                     y_train,
                     y_test,
                     feature_method='tf-idf'):
        '''
        @description: use different embedding features to train common ML models
        @param {type}
        X_train, features of the train set
        X_test, features of the test set
        y_train, labels of the train set
        y_test, labels of the test set
        feature_method, three options: tfidf, word2vec, and fasttext
        @return: None
        '''
        for model in self.models:
            model_name = model.__class__.__name__
            print(model_name)
            clf = model.fit(X_train, y_train)
            Test_predict_label = clf.predict(X_test)
            Train_predict_label = clf.predict(X_train)
            per, acc, recall, f1 = get_score(y_train, y_test,
                                             Train_predict_label,
                                             Test_predict_label)
            # Report the train-set accuracy
            logger.info(model_name + '_' + 'Train accuracy %s' % per)

            # Report the test-set accuracy
            logger.info(model_name + '_' + ' test accuracy %s' % acc)

            # Report the recall
            logger.info(model_name + '_' + 'test recall %s' % recall)

            # Report the F1-score
            logger.info(model_name + '_' + 'test F1_score %s' % f1)

    def predict(self, title, desc):

        inputs = self.process(title, desc)
        label = self.ix2label[self.model.predict(inputs)[0]]
        proba = np.max(self.model.predict_proba(inputs))
        return label, proba

    def save(self, model_name):

        joblib.dump(self.model, root_path + '/model/ml_model/' + model_name)

    def load(self, path):

        self.model = joblib.load(path)
Example #14
    random_state=0,
    n_estimators=num_emtimators,
    replacement=True,
    n_jobs=num_jobs)
balanced_RF.fit(xx_train, yy_train)

y_pred = balanced_RF.predict(xx_test)

print('testdataset-BalancedBaggingClassifier:')
print(classification_report_imbalanced(yy_test, y_pred))

y_pred = balanced_RF.predict(xx_train)
print('traindataset-BalancedBaggingClassifier:')
print(classification_report_imbalanced(yy_train, y_pred))

yy_probability = balanced_RF.predict_proba(xx_test)
listFilePath_test = rootpath + 'testlist.list'
L_file = open(listFilePath_test, 'r')
domaindata_path = '/home/shiqiang/feature_extraction/DeepDomFeatures/train/'
k = 0
startLen = 0
for line in L_file:
    if line.strip() == "":
        continue
    chain_name = line.split()[0]
    labelPath = domaindata_path + chain_name + '/' + chain_name + 'new.label'
    test_label = np.loadtxt(labelPath, dtype=np.int64)
    seqLength = test_label.shape[0]
    if seqLength > 700:
        seqLength = 700
    endLen = startLen + seqLength
AUC_model3(best_clf, X_train, y_train, X_test, y_test, n_classes)

# ### Make prediction
# Data to be predicted
data_predict = data2pred.drop(['pat', 'indication'], axis=1)
data_predict_index = data_predict.index
data_predict_pipeline = pd.DataFrame(pipeline.fit_transform(data_predict))
print(data_predict_pipeline.shape)
data_predict_pipeline.index = data_predict_index
data_predict_pipeline.head()

# Final prediction dataset needs to have the same contents as the training and testing set
pred_final_model = pd.DataFrame(
    best_clf.predict(data_predict_pipeline))  ## predicted indications
pred_final_prob = pd.DataFrame(
    best_clf.predict_proba(data_predict_pipeline))  ## predicted probabilities
pred_final_model.index = data_predict_index
pred_final_prob.index = data_predict_index
# Plot top features (using RandomUnderSampler, which may differ slightly from BalancedBaggingClassifier's internal sampling)
rus = RandomUnderSampler(random_state=1)
X_resampled, y_resampled = rus.fit_sample(X_train, y_train)

# Adapt X_train, y_train
X_train2 = X_resampled.copy()
y_train2 = y_resampled.copy()

GBM_clf.fit(X_train2, y_train2)

# Plot top features
feature_importances = pd.concat([
    pd.DataFrame(x_data_pipeline.columns),
def run_training(fold_):
    total_roc = []
    total_conf = []

    t0 = time.time()
    #df = pd.read_csv("../input/embedded_train_tiny_folds.csv")
    df = pd.read_hdf(path_or_buf="../input/tiny_data/full_data_folds.h5",
                     key='dataset')
    #print("tg\n",df.target.value_counts())
    #print(" ")
    t1 = time.time()
    total_time = t1 - t0
    print("time to read file", total_time)

    print(f"fold: {fold_}")

    t0 = time.time()

    train_df = df[df.kfold != fold_].reset_index(drop=True)
    test_df = df[df.kfold == fold_].reset_index(drop=True)
    #    print("train shape\n", train_df.shape)
    #   print("test shape\n", test_df.shape)

    #features
    xtrain = train_df.drop(["kfold", "target"], axis=1)
    xtest = test_df.drop(["kfold", "target"], axis=1)
    # Standard scaler

    sc = StandardScaler()
    sc.fit(xtrain)

    xtrain = sc.transform(xtrain)
    xtest = sc.transform(xtest)

    # target
    # First make the target binary
    train_df.target = train_df.target.apply(lambda x: 'open'
                                            if x == 'open' else 'closed')

    test_df.target = test_df.target.apply(lambda x: 'open'
                                          if x == 'open' else 'closed')
    ytrain = train_df.target
    ytest = test_df.target

    #model

    n_estimators = 500
    model = BalancedBaggingClassifier(
        linear_model.LogisticRegression(penalty='l2',
                                        C=10,
                                        class_weight='balanced',
                                        max_iter=5000,
                                        solver='liblinear'),
        n_estimators=n_estimators,
        n_jobs=-1,
        max_samples=0.2,
        max_features=0.6,
        # bootstrap_features=True
    )
    #fit the model on training data
    model.fit(xtrain, ytrain)
    # make predictions
    preds = model.predict(xtest)
    preds_proba = model.predict_proba(xtest)[:, 1]
    #print('preds shape',preds_proba.shape)

    t1 = time.time()
    total_time = t1 - t0
    print('time to fit model:', total_time)

    accuracy_score = np.sum(preds == ytest) / len(ytest)

    conf_m = confusion_matrix(ytest, preds)
    print("confusion m\n", conf_m)
    roc_score = roc_auc_score(ytest, preds_proba)
    print('ROC AUC score\n', roc_score)
    t = [fold_, roc_score]
    total_conf.append(conf_m)
    total_roc.append(t)
    test_df.loc[:, "lr_bagging_pred"] = preds_proba

    return test_df[["id", "target", "kfold",
                    "lr_bagging_pred"]], np.mean(total_roc, axis=0)[1]
Example #17
from imblearn.ensemble import EasyEnsemble
from sklearn.metrics import recall_score, precision_score
from sklearn.tree import DecisionTreeClassifier

from load_data import load_data
import logistic_regression
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from imblearn.ensemble import BalancedBaggingClassifier

from roc import calculate_roc, evaluate

if __name__ == '__main__':
    X_train, y_train = load_data(
        './dataset/car/car-vgood-5-fold/car-vgood-5-2tra.dat')
    X_test, y_test = load_data(
        './dataset/car/car-vgood-5-fold/car-vgood-5-2tst.dat')
    X_train, y_train = map(np.array, [X_train, y_train])
    X_test, y_test = map(np.array, [X_test, y_test])

    bbc = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                    ratio='auto',
                                    replacement=False,
                                    random_state=0)

    bbc.fit(X_train, y_train)
    score = bbc.predict_proba(X_test)
    evaluate(y_test, score)
Example #18
    def method13_notRec(self, name, test_ids):
        with open('../data/time_weight/fitting_balanced_' + name + '.pickle', 'rb') as f:
            time_weight = pickle.load(f)
        parm_dic = {'A': {'conv': 0, 'click': 0.20701892, 'view': 0.78720054, 'cart': 0.19557122},
                    'B': {'conv': 1, 'click': 0.43314098, 'view': 0.5480186, 'cart': 1},
                    'C': {'conv': 0, 'click': 0, 'view': 0.71978554, 'cart': 1},
                    'D': {'conv': 1, 'click': 0, 'view': 0.82985685, 'cart': 0}}
        if name != 'D':
            with open('../data/matrix/all_time_weighted_' + name + '.pickle', 'rb') as f:
                sparse_data = pickle.load(f)
            with open('../data/matrix/all_id_dic_time_weighted_' + name + '.pickle', 'rb') as f:
                id_dic = pickle.load(f)
            model = NMF(n_components=128, max_iter=1024, tol=0.001)
            user_feature_matrix = model.fit_transform(sparse_data)
            item_feature_matrix = model.components_
            if name != 'C':
                with open('../data/conv_pred/train_data_' + name + '.pickle', 'rb') as f:
                    data = pickle.load(f)
                with open('../data/conv_pred/test_X_cut_origin_' + name + '.pickle', 'rb') as f:
                    name_dic_train = pickle.load(f)
                v = DictVectorizer()
                X = v.fit_transform(data['X'])
                y = np.array(data['y'])

                forest = BalancedBaggingClassifier(n_estimators=500, n_jobs=1, random_state=777)
                forest.fit(X, y)
                forest2 = BalancedBaggingClassifier(n_estimators=500, n_jobs=1, random_state=1234)
                forest2.fit(X, y)
                forest3 = BalancedBaggingClassifier(n_estimators=500, n_jobs=1, random_state=1919)
                forest3.fit(X, y)
                forest4 = BalancedBaggingClassifier(n_estimators=500, n_jobs=1, random_state=114514)
                forest4.fit(X, y)
                forest5 = BalancedBaggingClassifier(n_estimators=500, n_jobs=1, random_state=334)
                forest5.fit(X, y)
                # with open('../data/conv_pred/train_data_notRec_' + name + '.pickle', 'rb') as f:
                #     data = pickle.load(f)
                # X = v.transform(data['X'])
                # y = np.array(data['y'])
                #
                # notRecforest = BalancedBaggingClassifier(n_estimators=100, n_jobs=1)
                # notRecforest.fit(X, y)
        test_min = datetime.datetime(year=2017, month=5, day=1)
        predict_test = {}
        for i in tqdm.tqdm(test_ids):
            # Get the unique item ids
            tmp_dict = {}
            past_items = pd.unique(self.personal_train[name][i]['product_id'])

            # Compute item weights from past data
            for j in past_items:
                tmp_dict[j] = 0
                for _, row in self.personal_train[name][i][
                            self.personal_train[name][i]['product_id'] == j].iterrows():
                    if row['event_type'] == 1:
                        tmp_dict[j] += parm_dic[name]['view'] * time_weight[
                            -1 * (row['time_stamp'] - test_min).days]
                    elif row['event_type'] == 0:
                        tmp_dict[j] += parm_dic[name]['cart'] * time_weight[
                            -1 * (row['time_stamp'] - test_min).days]
                    elif row['event_type'] == 2:
                        tmp_dict[j] += parm_dic[name]['click'] * time_weight[
                            -1 * (row['time_stamp'] - test_min).days]
                    elif row['event_type'] == 3:
                        tmp_dict[j] += parm_dic[name]['conv'] * time_weight[
                            -1 * (row['time_stamp'] - test_min).days]

            sorted_list = sorted(tmp_dict.items(), key=itemgetter(1), reverse=True)
            sorted_list = [x for x, y in sorted_list]
            old_set = sorted_list
            if name == 'D':
                if len(sorted_list) > 22:
                    sorted_list = sorted_list[:22]
                predict_test[i] = sorted_list
            else:
                if name != 'C':
                    sorted_list2 = []
                    input_data = []
                    for k in sorted_list:
                        if k in name_dic_train[i].keys() and len(name_dic_train[i][k]) != 0:
                            sorted_list2.append(k)
                            input_data.append(name_dic_train[i][k])
                    if len(input_data) != 0:
                        X = v.transform(input_data)
                        pred = forest.predict_proba(X)[:, 1]
                        pred2 = forest2.predict_proba(X)[:, 1]
                        pred3 = forest3.predict_proba(X)[:, 1]
                        pred4 = forest4.predict_proba(X)[:, 1]
                        pred5 = forest5.predict_proba(X)[:, 1]
                        pred = (pred + pred2 + pred3 + pred4 + pred5) / 5
                        #pred_notRec = notRecforest.predict_proba(X)[:,1]
                        conv_list = []
                        rec_list = []
                        mysort = sorted(zip(sorted_list2, pred), key=lambda x: x[1], reverse=True)
                        #notRecsort = sorted(zip(sorted_list2, pred_notRec), key=lambda x: x[1], reverse=False)
                        # for k in range(len(notRecsort)):
                        #     if notRecsort[k][1] >= 0.5:
                        #         conv_list.append(notRecsort[k][0])
                        for item, score in mysort:
                            if score >= 0.5:
                                rec_list.append(item)
                        for k in old_set:
                            if k not in rec_list:
                                rec_list.append(k)
                        sorted_list = rec_list
                if len(sorted_list) > 22:
                    sorted_list = sorted_list[:22]
                # elif name == 'A':
                #     for k in conv_list:
                #         if len(sorted_list) >= 22:
                #             break
                #         if k not in sorted_list:
                #             sorted_list.append(k)
                nmf_number = 22 - len(sorted_list)

                if nmf_number > 0:
                    est_user_eval = np.dot(user_feature_matrix[id_dic['user_id'].index(i)], item_feature_matrix)
                    # est_user_eval = cm.dot(cm.CUDAMatrix(user_feature_matrix[id_dic['user_id'].index(i):id_dic['user_id'].index(i) + 1]),cm.CUDAMatrix(item_feature_matrix)).asarray()[0]
                    tmp = sorted(zip(est_user_eval, id_dic['product_id']), key=lambda x: x[0], reverse=True)
                    predict = list(zip(*tmp))[1]

                    add_list = []
                    num = 0
                    while len(add_list) != nmf_number:
                        if predict[num] not in sorted_list:
                            add_list.append(predict[num])
                        num += 1
                    sorted_list.extend(add_list)
                predict_test[i] = sorted_list
        return predict_test
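
# The loop above averages predict_proba over five BalancedBaggingClassifier
# ensembles that differ only in random_state. Below is a minimal,
# self-contained sketch of that seed-averaging idea; synthetic data and
# arbitrary seeds, not the original pipeline:
import numpy as np
from sklearn.datasets import make_classification
from imblearn.ensemble import BalancedBaggingClassifier

X_demo, y_demo = make_classification(n_samples=500, weights=[0.9, 0.1],
                                     random_state=0)
seed_forests = [
    BalancedBaggingClassifier(n_estimators=100, random_state=seed).fit(X_demo,
                                                                       y_demo)
    for seed in (0, 1234, 1919)
]
# Element-wise mean of the positive-class probabilities across the seeds
pred_demo = np.mean([f.predict_proba(X_demo)[:, 1] for f in seed_forests],
                    axis=0)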
Exemple #19
class Models(object):
    """
    获取基于机器学习的文本算法
    """
    def __init__(self,
                 model_path=None,
                 feature_engineer=False,
                 train_mode=True):
        # Load the image models (resnet, resnext, wide resnet) and move them to cuda if available
        self.res_model = torchvision.models.resnet152(pretrained=True).to(
            config.device)
        self.resnext_model = torchvision.models.resnext101_32x8d(
            pretrained=True).to(config.device)
        self.wide_model = torchvision.models.wide_resnet101_2(
            pretrained=True).to(config.device)

        # Load the bert model and move it to cuda if available
        self.bert_tonkenizer = BertTokenizer.from_pretrained(config.root_path +
                                                             '/model/bert')
        self.bert = BertModel.from_pretrained(config.root_path +
                                              '/model/bert').to(config.device)

        # Initialize the MLData class; debug_mode=True uses only part of the data, train_mode toggles training
        self.ml_data = MLData(debug_mode=True, train_mode=train_mode)

        # If not training, load the trained model and predict
        if not train_mode:
            self.load(model_path)
            labelNameToIndex = json.load(
                open(config.root_path + '/data/label2id.json',
                     encoding='utf-8'))
            self.ix2label = {v: k for k, v in labelNameToIndex.items()}
        else:
            # If feature_engineer, train with lightgbm; otherwise compare classical ML models
            if feature_engineer:
                self.model = lgb.LGBMClassifier(objective='multiclass',
                                                n_jobs=10,
                                                num_class=33,
                                                num_leaves=30,
                                                reg_alpha=10,
                                                reg_lambda=200,
                                                max_depth=3,
                                                learning_rate=0.05,
                                                n_estimators=2000,
                                                bagging_freq=1,
                                                bagging_fraction=0.9,
                                                feature_fraction=0.8,
                                                seed=1440)
            else:
                self.models = [
                    RandomForestClassifier(n_estimators=500,
                                           max_depth=5,
                                           random_state=0),
                    LogisticRegression(solver='liblinear', random_state=0),
                    MultinomialNB(),
                    SVC(),
                    lgb.LGBMClassifier(objective='multiclass',
                                       n_jobs=10,
                                       num_class=33,
                                       num_leaves=30,
                                       reg_alpha=10,
                                       reg_lambda=200,
                                       max_depth=3,
                                       learning_rate=0.05,
                                       n_estimators=2000,
                                       bagging_freq=1,
                                       bagging_fraction=0.8,
                                       feature_fraction=0.8),
                ]
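                # These baseline models are compared head-to-head in
                # model_select() below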

    def feature_engineer(self):

        print(" generate embedding feature ")

        # Get tfidf and word2vec features; word2vec is not aggregated here
        train_tfidf, train = get_embedding_feature(self.ml_data.train,
                                                   self.ml_data.tfidf,
                                                   self.ml_data.w2v)

        # train is a pandas DataFrame; get_embedding_feature adds the columns:
        # w2v: each word of the sentence encoded by the w2v model. Each row of the column: [seq, 300]
        # w2v_label_mean: features relating the sentence embedding ([seq, 300]) to the labels. Each row: [300]
        # w2v_label_max: the same relation with max pooling. Each row: [300]
        # w2v_mean: [seq, 300] -> [300]
        # w2v_max: [seq, 300] -> [300]
        # w2v_win_2_mean: features pooled over a sliding window. Each row: [300]
        # w2v_win_3_mean
        # w2v_win_4_mean
        # w2v_win_2_max
        # w2v_win_3_max
        # w2v_win_4_max
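        #
        # As a rough sketch of the windowed variants (hypothetical code, with
        # m one row's [seq, 300] w2v matrix):
        #   win_2 = np.array([m[k:k + 2].mean(axis=0) for k in range(len(m) - 1)])
        #   w2v_win_2_mean, w2v_win_2_max = win_2.mean(axis=0), win_2.max(axis=0)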

        test_tfidf, test = get_embedding_feature(self.ml_data.dev,
                                                 self.ml_data.tfidf,
                                                 self.ml_data.w2v)

        print("generate basic feature ")
        # Get basic NLP features
        train = get_basic_feature(train)
        test = get_basic_feature(test)

        print("generate lda feature ")

        # Build bag-of-words format data
        train['bow'] = train['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.lda.id2word.doc2bow(x))
        test['bow'] = test['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.lda.id2word.doc2bow(x))
        # One row of test['bow']: [(10, 1), (78, 1), (162, 3), (177, 1), (192, 1)...]

        # Derive the lda embedding on top of the bag-of-words data
        train['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.lda, doc),
                train['bow']))
        test['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.lda, doc),
                test['bow']))
        # One row of test['lda']: [0.002929521957412362, 0.0024772200267761946, ....]; with 30 topics, each row is the probability distribution over those topics

        print("generate modal feature ")
        # List the book-cover image files
        cover = os.listdir(config.book_cover_path)
        # Match each book's cover by its title
        train['cover'] = train['title'].progress_apply(
            lambda x: config.book_cover_path + x + '.jpg'
            if x + '.jpg' in cover else '')
        test['cover'] = test.title.progress_apply(
            lambda x: config.book_cover_path + x + '.jpg'
            if x + '.jpg' in cover else '')

        # Get the embeddings of the covers
        train['res_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        test['res_embedding'] = test.cover.progress_apply(
            lambda x: get_img_embedding(x, self.res_model))

        train['resnext_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))
        test['resnext_embedding'] = test.cover.progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))

        train['wide_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))
        test['wide_embedding'] = test.cover.progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))

        print("generate bert feature ")
        train['bert_embedding'] = train['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer, self.bert
                                             ))
        test['bert_embedding'] = test['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer, self.bert
                                             ))

        # print("generate autoencoder feature ")
        # Get the autoencoder embedding from the encoder (not the decoder)
        # TODO
        # train_ae = get_autoencoder_feature(
        #     train,
        #     self.ml_data.ae.max_features,
        #     self.ml_data.ae.max_len,
        #     self.ml_data.ae.encoder,
        #     tokenizer=self.ml_data.ae.tokenizer)
        # test_ae = get_autoencoder_feature(
        #     test,
        #     self.ml_data.ae.max_features,
        #     self.ml_data.ae.max_len,
        #     self.ml_data.ae.encoder,
        #     tokenizer=self.ml_data.ae.tokenizer)

        print("formate data")

        #  Concatenate all the features
        train = formate_data(
            train,
            train_tfidf)  # train = formate_data(train, train_tfidf, train_ae)
        test = formate_data(
            test, test_tfidf)  # test = formate_data(test, test_tfidf, test_ae)

        #  Build the training and test data
        cols = [x for x in train.columns if str(x) not in ['labelIndex']]

        X_train = train[cols]
        X_test = test[cols]

        print(X_test)

        train["labelIndex"] = train["labelIndex"].astype(int)
        test["labelIndex"] = test["labelIndex"].astype(int)

        y_train = train["labelIndex"]
        y_test = test["labelIndex"]

        return X_train, X_test, y_train, y_test

    def param_search(self, search_method='grid'):
        # Find the best parameters with grid search or bayesian optimization
        if search_method == 'grid':
            print("use grid search")
            self.model = Grid_Train_model(self.model, self.X_train,
                                          self.X_test, self.y_train,
                                          self.y_test)
        elif search_method == 'bayesian':
            print("use bayesian optimization")
            trn_data = lgb.Dataset(data=self.X_train,
                                   label=self.y_train,
                                   free_raw_data=False)
            param = bayes_parameter_opt_lgb(trn_data)
            print("best param", param)
            return param

    def unbalance_helper(self,
                         imbalance_method='under_sampling',
                         search_method='grid'):

        print("get all feature")

        # Generate all features

        self.X_train, self.X_test, self.y_train, self.y_test = self.feature_engineer(
        )
        model_name = None

        # Choose how to handle the class imbalance: over-sampling, under-sampling, or ensemble

        if imbalance_method == 'over_sampling':
            print("Use SMOTE to deal with unbalanced data")
            # https://www.zhihu.com/question/269698662
            # https://www.cnblogs.com/kamekin/p/9824294.html
            self.X_train, self.y_train = SMOTE().fit_resample(
                self.X_train, self.y_train)
            self.X_test, self.y_test = SMOTE().fit_resample(
                self.X_test, self.y_test)
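            # Note: resampling the test set as above is unusual; normally only
            # the training data is resampled so that evaluation reflects the
            # true class distribution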
            model_name = 'lgb_over_sampling'
        elif imbalance_method == 'under_sampling':
            print("Use ClusterCentroids deal with unbalance data")
            self.X_train, self.y_train = ClusterCentroids(
                random_state=0).fit_resample(self.X_train, self.y_train)
            self.X_test, self.y_test = ClusterCentroids(
                random_state=0).fit_resample(self.X_test, self.y_test)
            model_name = 'lgb_under_sampling'
        elif imbalance_method == 'ensemble':
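            # BalancedBaggingClassifier balances each bootstrap sample by
            # random under-sampling, so every base estimator trains on
            # balanced data and no separate resampling step is needed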
            self.model = BalancedBaggingClassifier(
                base_estimator=DecisionTreeClassifier(),
                sampling_strategy='auto',
                replacement=False,
                random_state=0)
            model_name = 'ensemble'
        print('search best param')

        # Use set_params to apply the best parameters found by the search

        if imbalance_method != 'ensemble':
            param = self.param_search(search_method=search_method)
            param['params']['num_leaves'] = int(param['params']['num_leaves'])
            param['params']['max_depth'] = int(param['params']['max_depth'])
            self.model = self.model.set_params(**param['params'])
        print('fit model ')

        # Train the model and report its results

        self.model.fit(self.X_train, self.y_train)
        Test_predict_label = self.model.predict(self.X_test)
        Train_predict_label = self.model.predict(self.X_train)
        per, acc, recall, f1 = get_score(self.y_train, self.y_test,
                                         Train_predict_label,
                                         Test_predict_label)

        # Print the training-set precision
        print('Train accuracy %s' % per)
        # Print the test-set accuracy
        print('test accuracy %s' % acc)
        # Print the recall
        print('test recall %s' % recall)
        # Print the F1-score
        print('test F1_score %s' % f1)
        self.save(model_name)

    def model_select(self,
                     X_train,
                     X_test,
                     y_train,
                     y_test,
                     feature_method='tf-idf'):
        # Compare tfidf / word2vec / fasttext embeddings and common ML models
        for model in self.models:
            model_name = model.__class__.__name__
            print(model_name)
            clf = model.fit(X_train, y_train)
            Test_predict_label = clf.predict(X_test)
            Train_predict_label = clf.predict(X_train)
            per, acc, recall, f1 = get_score(y_train, y_test,
                                             Train_predict_label,
                                             Test_predict_label)
            # Print the training-set accuracy
            print(model_name + '_' + 'Train accuracy %s' % per)

            # Print the test-set accuracy
            print(model_name + '_' + ' test accuracy %s' % acc)

            # Print the recall
            print(model_name + '_' + 'test recall %s' % recall)

            # Print the F1-score
            print(model_name + '_' + 'test F1_score %s' % f1)

    def process(self, title, desc):

        # Process the data and build the features the model needs for prediction
        df = pd.DataFrame([[title, desc]], columns=['title', 'desc'])
        df['text'] = df['title'] + df['desc']
        df["queryCut"] = df["text"].apply(query_cut)
        df["queryCutRMStopWord"] = df["queryCut"].apply(
            lambda x: [word for word in x if word not in get_stop_word_list()])

        df_tfidf, df = get_embedding_feature(df, self.ml_data.tfidf,
                                             self.ml_data.w2v)

        print("generate basic feature ")
        df = get_basic_feature(df)

        print("generate modal feature ")
        df['cover'] = ''

        df['res_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.res_model))

        df['resnext_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))

        df['wide_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))

        print("generate bert feature ")
        df['bert_embedding'] = df.text.progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer, self.bert
                                             ))

        print("generate lda feature ")
        df['bow'] = df['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.lda.id2word.doc2bow(x))
        df['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.lda, doc), df.bow))

        print("generate autoencoder feature ")
        # df_ae = get_autoencoder_feature(df,
        #                                 self.ml_data.ae.max_features,
        #                                 self.ml_data.ae.max_len,
        #                                 self.ml_data.ae.encoder,
        #                                 tokenizer=self.ml_data.ae.tokenizer)

        print("formate data")
        df['labelIndex'] = 1
        df = formate_data(df, df_tfidf)  #, df_ae)
        cols = [x for x in df.columns if str(x) not in ['labelIndex']]
        X_train = df[cols]
        return X_train

    def predict(self, title, desc):
        '''
        @description: predict the book category from the input title and desc
        @param {type}
        title: input title
        desc: input description
        @return: label
        '''
        inputs = self.process(title, desc)
        label = self.ix2label[self.model.predict(inputs)[0]]
        proba = np.max(self.model.predict_proba(inputs))
        return label, proba

    def save(self, model_name):
        '''
        @description: save the model
        @param {type}
        model_name: file name for saving
        @return: None
        '''
        joblib.dump(self.model, root_path + '/model/ml_model/' + model_name)

    def load(self, path):
        '''
        @description: load the model
        @param {type}
        path: model path
        @return: None
        '''
        self.model = joblib.load(path)
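
# A minimal usage sketch for the Models class above, assuming the repo's
# config, pretrained weights, and data files are in place (a hypothetical
# call, not part of the original source):
#
#     models = Models(feature_engineer=False, train_mode=True)
#     models.unbalance_helper(imbalance_method='ensemble',
#                             search_method='grid')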
Exemple #20
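    # (Excerpt: train_vecs, instances, train_labels, val_labels and results
    # are defined earlier in the original script.)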
    val_vecs = (joblib.load("./vectorized_data/val_%s" % i) for i in instances)

    # BalancedBaggingClassifier n_estimators and n_jobs params
    params = [(100, -1), (50, -1)]

    for data in train_vecs:
        val_vec = next(val_vecs)  # 'sync' training and validation data
        for param in params:
            print("Processing %s %s" % (data[1], param[0]))
            bb_model = BalancedBaggingClassifier(n_estimators=param[0],
                                                 n_jobs=param[1],
                                                 ratio="not minority")
            print("Fitting...")
            bb_model.fit(data[0], train_labels)
            print("Testing...")
            preds = bb_model.predict_proba(val_vec)

            auc = roc_auc_score(val_labels, preds[:, 1])
            brier = brier_score_loss(val_labels, preds[:, 1])

            results = results.append(
                {
                    "data_file": data[1],
                    "bb_n_est": param[0],
                    "auc": auc,
                    "brier": brier
                },
                ignore_index=True)
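            # Note: DataFrame.append was removed in pandas 2.0; on current
            # pandas the same accumulation can be written as, e.g.,
            # results = pd.concat([results, row_df], ignore_index=True)
            # with row_df a one-row DataFrame built from the dict above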
            results.to_csv("./classifier_results.csv", index=False)
            print("AUC: %.3f, BRIER: %.3f" % (auc, brier))