コード例 #1
0
ファイル: MLTrain.py プロジェクト: nongfang55/review
    def RecommendByNativeBayes(train_data, train_data_y, test_data, test_data_y, recommendNum=5, bayesType=1):
        """Fit a naive Bayes classifier and turn its class probabilities into recommendations.

        Args:
            train_data: training features, passed to ``fit`` as ``X``.
            train_data_y: training labels, passed to ``fit`` as ``y``.
            test_data: features scored with ``predict_proba``.
            test_data_y: iterable of test labels; each label is wrapped in a
                single-element list to form the answer list.
            recommendNum: recommendation count forwarded to
                ``DataProcessUtils.getListFromProbable``.
            bayesType: model variant: 1 Bernoulli, 2 Gaussian,
                3 Multinomial (tuned by a grid search over ``alpha`` and
                ``fit_prior``).

        Returns:
            ``[recommendList, answer]`` where ``recommendList`` is the result of
            ``DataProcessUtils.getListFromProbable`` and ``answer`` is
            ``[[label] for label in test_data_y]``.

        Raises:
            ValueError: if ``bayesType`` is not 1, 2 or 3.
        """
        # Reject unknown variants up front so the caller gets a clear error
        # instead of an AttributeError from calling fit() on None.
        if bayesType not in (1, 2, 3):
            raise ValueError(
                "bayesType must be 1 (Bernoulli), 2 (Gaussian) or 3 (Multinomial), got %r" % (bayesType,))

        from sklearn.model_selection import GridSearchCV
        from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

        if bayesType == 2:
            clf = GaussianNB()
        elif bayesType == 3:
            # NOTE(review): the grid starts at alpha = 0.0 (no smoothing) - confirm this is intended.
            param = {"alpha": [0.2 * x for x in range(0, 10)], "fit_prior": [False, True]}
            clf = GridSearchCV(MultinomialNB(), param_grid=param)
        else:
            clf = BernoulliNB()

        clf.fit(X=train_data, y=train_data_y)
        if bayesType == 3:
            # Only the grid-searched variant exposes best_params_ / best_score_.
            print(clf.best_params_, clf.best_score_)

        # Show the learning curve of the fitted estimator.
        MLGraphHelper.plot_learning_curve(clf, 'Bayes', train_data, train_data_y).show()

        pre = clf.predict_proba(test_data)
        pre_class = clf.classes_

        recommendList = DataProcessUtils.getListFromProbable(pre, pre_class, recommendNum)
        answer = [[x] for x in test_data_y]
        return [recommendList, answer]
コード例 #2
0
    def RecommendByRF(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
        """Multi-label recommendation with a random forest.

        Fits a ``RandomForestClassifier`` (50 trees, depth 5, all cores) on the
        training data, converts its ``predict_proba`` output with
        ``DataProcessUtils.convertMultilabelProbaToDataArray`` and asks
        ``DataProcessUtils.getListFromProbable`` for ``recommendNum``
        recommendations, using the label ids ``1 .. train_data_y.shape[1]``.

        Returns:
            ``[recommendList, answerList]`` where ``answerList`` is
            ``test_data_y`` unchanged. Intermediate values are printed for
            debugging.
        """
        # Hyper-parameters are fixed here; no grid search over n_estimators,
        # max_depth or min_samples_split is run.
        forest = RandomForestClassifier(n_estimators=50, max_depth=5, n_jobs=-1)
        forest.fit(train_data, train_data_y)

        # Convert the raw per-label probabilities into a data array.
        raw_proba = forest.predict_proba(test_data)
        proba = DataProcessUtils.convertMultilabelProbaToDataArray(raw_proba)
        print(proba)

        label_ids = range(1, train_data_y.shape[1] + 1)
        recommendList = DataProcessUtils.getListFromProbable(proba, label_ids, recommendNum)
        answerList = test_data_y
        for debug_value in (proba, test_data_y, recommendList, answerList):
            print(debug_value)
        return [recommendList, answerList]
コード例 #3
0
    def RecommendByDT(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
        """Multi-label recommendation with a grid-searched decision tree.

        Runs ``GridSearchCV`` over ``min_samples_leaf`` and ``max_depth`` of a
        ``DecisionTreeClassifier``, converts the ``predict_proba`` output with
        ``DataProcessUtils.convertMultilabelProbaToDataArray`` and builds the
        recommendation list for the label ids ``1 .. train_data_y.shape[1]``.

        Returns:
            ``[recommendList, answerList]`` where ``answerList`` is
            ``test_data_y`` unchanged. The best parameters and intermediate
            values are printed for debugging.
        """
        from sklearn.tree import DecisionTreeClassifier
        from sklearn.model_selection import GridSearchCV

        # Hyper-parameter grid explored by the search.
        search_space = [
            {'min_samples_leaf': [2, 4, 8, 16, 32, 64], 'max_depth': [2, 4, 6, 8]}]
        searcher = GridSearchCV(DecisionTreeClassifier(), param_grid=search_space, n_jobs=-1)
        searcher.fit(train_data, train_data_y)

        raw_proba = searcher.predict_proba(test_data)
        print(searcher.best_params_)

        # Convert the raw per-label probabilities into a data array.
        proba = DataProcessUtils.convertMultilabelProbaToDataArray(raw_proba)
        print(proba)

        label_ids = range(1, train_data_y.shape[1] + 1)
        recommendList = DataProcessUtils.getListFromProbable(proba, label_ids, recommendNum)
        answerList = test_data_y
        for debug_value in (proba, test_data_y, recommendList, answerList):
            print(debug_value)
        return [recommendList, answerList]
コード例 #4
0
ファイル: MLTrain.py プロジェクト: nongfang55/review
    def RecommendBySVM(train_data, train_data_y, test_data, test_data_y, recommendNum=5, CoreType='rbf', C=1,
                       gamma='auto',
                       decisionShip='ovo'):
        """Fit an SVM classifier and turn its class probabilities into recommendations.

        Args:
            train_data: training features, passed to ``fit`` as ``X``.
            train_data_y: training labels, passed to ``fit`` as ``y``.
            test_data: features scored with ``predict_proba``.
            test_data_y: iterable of test labels; each label is wrapped in a
                single-element list to form the answer list.
            recommendNum: recommendation count forwarded to
                ``DataProcessUtils.getListFromProbable``.
            CoreType: SVC kernel name, e.g. ``'linear'`` or ``'rbf'`` (Gaussian).
            C: penalty coefficient.
            gamma: kernel coefficient.
            decisionShip: value for SVC's ``decision_function_shape``.

        Returns:
            ``[recommendList, answer]`` where ``answer`` is
            ``[[label] for label in test_data_y]``.
        """
        from sklearn import svm

        # No hyper-parameter search is run: the model uses exactly the C /
        # kernel / gamma passed in. Per the original author's notes, the review
        # features are time-dependent, so any future search should validate on
        # a predefined hold-out split (e.g. GridSearchCV(cv=PredefinedSplit(...)))
        # rather than on shuffled k-fold.
        # probability=True is needed so that predict_proba is available.
        clf = svm.SVC(C=C, kernel=CoreType, probability=True, gamma=gamma, decision_function_shape=decisionShip)
        clf.fit(X=train_data, y=train_data_y)

        pre = clf.predict_proba(test_data)
        pre_class = clf.classes_

        # Show the learning curve of the fitted estimator.
        MLGraphHelper.plot_learning_curve(clf, 'SVM', train_data, train_data_y).show()

        recommendList = DataProcessUtils.getListFromProbable(pre, pre_class, recommendNum)
        answer = [[x] for x in test_data_y]
        return [recommendList, answer]
コード例 #5
0
ファイル: MLTrain.py プロジェクト: nongfang55/review
    def RecommendByRandomForest(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
        """Fit a grid-searched random forest and turn its class probabilities into recommendations.

        The base forest uses ``min_samples_leaf=20``, ``max_features='sqrt'``
        and ``random_state=10``; ``max_depth`` and ``min_samples_split`` are
        tuned with a 5-fold ``GridSearchCV``.

        Args:
            train_data: training features.
            train_data_y: training labels.
            test_data: features scored with ``predict_proba``.
            test_data_y: iterable of test labels; each label is wrapped in a
                single-element list to form the answer list.
            recommendNum: recommendation count forwarded to
                ``DataProcessUtils.getListFromProbable``.

        Returns:
            ``[recommendList, answer]`` where ``answer`` is
            ``[[label] for label in test_data_y]``.
        """
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.model_selection import GridSearchCV

        base_forest = RandomForestClassifier(min_samples_split=100,
                                             min_samples_leaf=20, max_depth=8, max_features='sqrt', random_state=10)

        # Tune the tree-shape parameters. The ``iid`` keyword is deliberately
        # not passed: it was removed from GridSearchCV in scikit-learn 0.24
        # (passing it raises TypeError), and the default since 0.22 already
        # matches the former ``iid=False``.
        param_test2 = {'max_depth': range(3, 14, 2), 'min_samples_split': range(50, 201, 20)}
        clf = GridSearchCV(estimator=base_forest, param_grid=param_test2, cv=5)
        clf.fit(train_data, train_data_y)

        # Show the learning curve of the fitted search estimator.
        MLGraphHelper.plot_learning_curve(clf, 'RF', train_data, train_data_y).show()

        pre = clf.predict_proba(test_data)
        pre_class = clf.classes_

        recommendList = DataProcessUtils.getListFromProbable(pre, pre_class, recommendNum)
        answer = [[x] for x in test_data_y]
        return [recommendList, answer]
コード例 #6
0
    def RecommendBySVM(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
        """Multi-label recommendation with a one-vs-rest linear SVM.

        Wraps a linear ``SVC`` (balanced class weights, ``C=70``, probability
        estimates enabled) in ``OneVsRestClassifier``, fits it and builds the
        recommendation list from ``predict_proba`` for the label ids
        ``1 .. train_data_y.shape[1]``.

        Returns:
            ``[recommendList, answerList]`` where ``answerList`` is
            ``test_data_y`` unchanged.
        """
        base_svc = SVC(kernel='linear', probability=True, class_weight='balanced', C=70)
        one_vs_rest = OneVsRestClassifier(base_svc)
        one_vs_rest.fit(train_data, train_data_y)

        proba = one_vs_rest.predict_proba(test_data)
        label_ids = range(1, train_data_y.shape[1] + 1)
        recommendList = DataProcessUtils.getListFromProbable(proba, label_ids, recommendNum)

        answerList = test_data_y
        return [recommendList, answerList]
コード例 #7
0
    def RecommendByClassifierChain(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
        """Multi-label recommendation with a classifier chain of random forests.

        Fits ``ClassifierChain`` around a ``RandomForestClassifier``
        (``oob_score=True``, ``max_depth=10``, ``min_samples_split=20``),
        densifies its ``predict_proba`` output and builds the recommendation
        list for the label ids ``1 .. train_data_y.shape[1]``.

        Returns:
            ``[recommendList, answerList]`` where ``answerList`` is
            ``test_data_y`` unchanged. Intermediate values are printed for
            debugging.
        """
        base_forest = RandomForestClassifier(oob_score=True, max_depth=10, min_samples_split=20)
        chain = ClassifierChain(base_forest)
        chain.fit(train_data, train_data_y)

        # predict_proba output is densified, then unwrapped to a plain array.
        dense_proba = chain.predict_proba(test_data).todense()
        proba = dense_proba.getA()

        label_ids = range(1, train_data_y.shape[1] + 1)
        recommendList = DataProcessUtils.getListFromProbable(proba, label_ids, recommendNum)
        answerList = test_data_y
        for debug_value in (proba, test_data_y, recommendList, answerList):
            print(debug_value)
        return [recommendList, answerList]
コード例 #8
0
    def RecommendByMLKNN(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
        """Multi-label recommendation with the ML-kNN algorithm.

        Fits ``MLkNN`` with ``k`` set to the number of label columns
        (``train_data_y.shape[1]``), densifies its ``predict_proba`` output into
        a numpy array and builds the recommendation list for the label ids
        ``1 .. train_data_y.shape[1]``.

        Returns:
            ``[recommendList, answerList]`` where ``answerList`` is
            ``test_data_y`` unchanged. Intermediate values are printed for
            debugging.
        """
        label_count = train_data_y.shape[1]
        knn = MLkNN(k=label_count)
        knn.fit(train_data, train_data_y)

        # Densify the prediction result and convert it to a plain data array.
        proba = numpy.asarray(knn.predict_proba(test_data).todense())

        label_ids = range(1, train_data_y.shape[1] + 1)
        recommendList = DataProcessUtils.getListFromProbable(proba, label_ids, recommendNum)
        answerList = test_data_y
        for debug_value in (proba, test_data_y, recommendList, answerList):
            print(debug_value)
        return [recommendList, answerList]
コード例 #9
0
    def RecommendByKN(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
        """Multi-label recommendation with a k-nearest-neighbours classifier.

        Fits a default ``KNeighborsClassifier``, converts its ``predict_proba``
        output with ``DataProcessUtils.convertMultilabelProbaToDataArray`` and
        builds the recommendation list for the label ids
        ``1 .. train_data_y.shape[1]``.

        Returns:
            ``[recommendList, answerList]`` where ``answerList`` is
            ``test_data_y`` unchanged. Intermediate values are printed for
            debugging.
        """
        neighbours = KNeighborsClassifier()
        neighbours.fit(train_data, train_data_y)

        # Convert the raw per-label probabilities into a data array.
        raw_proba = neighbours.predict_proba(test_data)
        proba = DataProcessUtils.convertMultilabelProbaToDataArray(raw_proba)
        print(proba)

        label_ids = range(1, train_data_y.shape[1] + 1)
        recommendList = DataProcessUtils.getListFromProbable(proba, label_ids, recommendNum)
        answerList = test_data_y
        for debug_value in (proba, test_data_y, recommendList, answerList):
            print(debug_value)
        return [recommendList, answerList]
コード例 #10
0
ファイル: MLTrain.py プロジェクト: nongfang55/review
    def RecommendByDecisionTree(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
        """Fit a grid-searched decision tree and turn its class probabilities into recommendations.

        ``min_samples_leaf`` and ``max_depth`` are tuned with ``GridSearchCV``
        on a predefined hold-out split instead of k-fold cross-validation: the
        first ``ceil(0.7 * n)`` training rows are marked ``-1`` and the
        remaining rows ``0`` in the ``test_fold`` array given to
        ``PredefinedSplit``.

        Args:
            train_data: training features; must expose ``shape[0]``.
            train_data_y: training labels.
            test_data: features scored with ``predict_proba``.
            test_data_y: iterable of test labels; each label is wrapped in a
                single-element list to form the answer list.
            recommendNum: recommendation count forwarded to
                ``DataProcessUtils.getListFromProbable``.

        Returns:
            ``[recommendList, answer]`` where ``answer`` is
            ``[[label] for label in test_data_y]``. The best parameters found
            are printed.
        """
        # Build the custom validation split: leading 70% marked -1, rest 0.
        sample_count = train_data.shape[0]
        fold_marks = numpy.zeros(sample_count)
        fold_marks[:ceil(sample_count * 0.7)] = -1
        holdout_split = PredefinedSplit(test_fold=fold_marks)

        # Hyper-parameter grid explored by the search.
        search_space = [
            {'min_samples_leaf': [2, 4, 8, 16, 32, 64], 'max_depth': [2, 4, 6, 8],
             'class_weight': [None]}]

        from sklearn.tree import DecisionTreeClassifier
        from sklearn.model_selection import GridSearchCV
        searcher = GridSearchCV(DecisionTreeClassifier(), param_grid=search_space, cv=holdout_split, n_jobs=-1)
        searcher.fit(train_data, train_data_y)

        print(searcher.best_params_)

        class_proba = searcher.predict_proba(test_data)
        class_labels = searcher.classes_

        recommendList = DataProcessUtils.getListFromProbable(class_proba, class_labels, recommendNum)
        answer = [[label] for label in test_data_y]
        return [recommendList, answer]
コード例 #11
0
    def RecommendByETS(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
        """Multi-label recommendation with grid-searched extra-trees.

        Tunes ``max_depth`` and ``min_samples_split`` of an
        ``ExtraTreesClassifier`` (250 trees) with a 10-fold ``GridSearchCV``,
        converts the ``predict_proba`` output with
        ``DataProcessUtils.convertMultilabelProbaToDataArray`` and builds the
        recommendation list for the label ids ``1 .. train_data_y.shape[1]``.

        Args:
            train_data: training features.
            train_data_y: training label matrix; ``shape[1]`` gives the number
                of labels.
            test_data: features scored with ``predict_proba``.
            test_data_y: test labels, returned unchanged as the answer list.
            recommendNum: recommendation count forwarded to
                ``DataProcessUtils.getListFromProbable``.

        Returns:
            ``[recommendList, answerList]``. Intermediate values are printed
            for debugging.
        """
        base_trees = ExtraTreesClassifier(n_jobs=3, n_estimators=250)
        param_test2 = {'max_depth': range(10, 40, 10), 'min_samples_split': range(15, 30, 5)}
        # The ``iid`` keyword is deliberately not passed: it was removed from
        # GridSearchCV in scikit-learn 0.24 (passing it raises TypeError), and
        # the default since 0.22 already matches the former ``iid=False``.
        clf = GridSearchCV(estimator=base_trees, param_grid=param_test2, cv=10, n_jobs=2)

        clf.fit(train_data, train_data_y)
        predictions = clf.predict_proba(test_data)
        # Convert the raw per-label probabilities into a data array.
        predictions = DataProcessUtils.convertMultilabelProbaToDataArray(predictions)
        print(predictions)

        recommendList = DataProcessUtils.getListFromProbable(predictions, range(1, train_data_y.shape[1] + 1),
                                                             recommendNum)
        answerList = test_data_y
        print(predictions)
        print(test_data_y)
        print(recommendList)
        print(answerList)
        return [recommendList, answerList]