Beispiel #1
0
def cv_on_dataset(dataset, feature_param, model_param, ax, result_store,
                  contentid_groups, logger=None, aggregate_method=np.mean):
    """Cross-validate one model on one dataset and report the aggregate stats.

    The assets are read from ``dataset``, features are extracted with a
    ``FeatureAssembler``, and the model class named by
    ``model_param.model_type`` is evaluated with
    ``ModelCrossValidation.run_kfold_cross_validation``. The feature and
    model parameters and the formatted aggregate statistics are printed;
    when ``ax`` is given, a scatter plot is drawn on it as well.

    Args:
        dataset: passed to ``read_dataset``; its ``dataset_name`` attribute
            is used in the plot title.
        feature_param: object exposing ``feature_dict``.
        model_param: object exposing ``model_type`` and ``model_param_dict``.
        ax: object to plot on (presumably a matplotlib Axes -- confirm with
            callers), or None to skip plotting.
        result_store: forwarded to ``FeatureAssembler``.
        contentid_groups: forwarded to ``construct_kfold_list`` together
            with the assets.
        logger: forwarded to ``FeatureAssembler`` and to the cross-validation
            call.
        aggregate_method: callable set on every feature result through
            ``set_score_aggregate_method``; defaults to ``np.mean``.

    Returns:
        Tuple ``(assets, cv_output)`` where ``cv_output`` is the value
        returned by ``run_kfold_cross_validation``.
    """
    assets = read_dataset(dataset)
    folds = construct_kfold_list(assets, contentid_groups)

    assembler = FeatureAssembler(
        feature_dict=feature_param.feature_dict,
        feature_option_dict=None,
        assets=assets,
        logger=logger,
        delete_workdir=True,
        result_store=result_store,
        optional_dict=None,
        optional_dict2=None,
        # The original author noted parallelize=False, fifo_mode=False as
        # the alternative settings for VQM.
        parallelize=True,
        fifo_mode=True,
    )
    assembler.run()
    feature_results = assembler.results

    for feature_result in feature_results:
        feature_result.set_score_aggregate_method(aggregate_method)

    model_class = TrainTestModel.find_subclass(model_param.model_type)

    # Run k-fold cross validation over the assembled feature results.
    cv_result = ModelCrossValidation.run_kfold_cross_validation(
        model_class,
        model_param.model_param_dict,
        feature_results,
        folds,
        logger=logger,
    )

    print('Feature parameters: {}'.format(feature_param.feature_dict))
    print('Model type: {}'.format(model_param.model_type))
    print('Model parameters: {}'.format(model_param.model_param_dict))
    aggr_stats = cv_result['aggr_stats']
    print('Stats: {}'.format(model_class.format_stats_for_print(aggr_stats)))

    # Nothing to draw on: return the results straight away.
    if ax is None:
        return assets, cv_result

    model_class.plot_scatter(ax, aggr_stats,
                             content_ids=cv_result['contentids'])
    ax.set_xlabel('True Score')
    ax.set_ylabel("Predicted Score")
    ax.grid()
    title = "Dataset: {dataset}, Model: {model},\n{stats}".format(
        dataset=dataset.dataset_name,
        model=model_param.model_type,
        stats=model_class.format_stats_for_plot(aggr_stats))
    ax.set_title(title)

    return assets, cv_result
Beispiel #2
0
    def test_run_kfold_cross_validation_with_list_input(self):
        # Cross-validate a random forest model on self.features, passing the
        # folds as an explicit list of index lists, then compare the
        # aggregate statistics with stored reference values.
        explicit_folds = [[0, 3, 8], [2, 1, 5], [4, 6, 7]]
        forest_params = dict(norm_type='normalize',
                             n_estimators=10,
                             random_state=0)

        cv_result = ModelCrossValidation.run_kfold_cross_validation(
            SklearnRandomForestTrainTestModel, forest_params, self.features,
            explicit_folds)

        aggr_stats = cv_result['aggr_stats']
        # (statistic, reference value, decimal places compared)
        reference = (
            ('SRCC', 0.18333333333333335, 4),
            ('PCC', 0.35513638509959689, 4),
            ('KENDALL', 0.1111111111111111, 3),
            ('RMSE', 1.2740400878438387, 3),
        )
        for stat_name, expected, places in reference:
            self.assertAlmostEqual(aggr_stats[stat_name], expected,
                                   places=places)
Beispiel #3
0
    def test_run_kfold_cross_validation_libsvmnusvr(self):
        # Cross-validate LibsvmNusvrTrainTestModel on self.features with 3
        # passed as the k-fold argument, then compare the aggregate
        # statistics with stored reference values to 4 decimal places.
        cv_result = ModelCrossValidation.run_kfold_cross_validation(
            LibsvmNusvrTrainTestModel, {'norm_type': 'normalize'},
            self.features, 3)

        aggr_stats = cv_result['aggr_stats']
        reference = (
            ('SRCC', 0.31666666666666665),
            ('PCC', 0.33103132578536021),
            ('KENDALL', 0.27777777777777779),
            ('RMSE', 1.2855099934718619),
        )
        for stat_name, expected in reference:
            self.assertAlmostEqual(aggr_stats[stat_name], expected, places=4)
Beispiel #4
0
    def test_run_kfold_cross_validation_extratrees(self):
        # Cross-validate SklearnExtraTreesTrainTestModel on self.features with
        # 3 passed as the k-fold argument, then compare the aggregate
        # statistics with stored reference values to 4 decimal places.
        trees_params = {
            'norm_type': 'normalize',
            'random_state': 0,
            'n_estimators': 10,
        }

        cv_result = ModelCrossValidation.run_kfold_cross_validation(
            SklearnExtraTreesTrainTestModel, trees_params, self.features, 3)

        aggr_stats = cv_result['aggr_stats']
        reference = (
            ('SRCC', 0.17320508075688773),
            ('PCC', 0.33023719320146966),
            ('KENDALL', 0.14907119849998599),
            ('RMSE', 1.3279056191361394),
        )
        for stat_name, expected in reference:
            self.assertAlmostEqual(aggr_stats[stat_name], expected, places=4)
Beispiel #5
0
def cv_on_dataset(dataset, feature_param, model_param, ax, result_store,
                  contentid_groups, logger=None, aggregate_method=np.mean):
    """Cross-validate one model on one dataset and report the aggregate stats.

    The assets are read from ``dataset``, features are extracted with a
    ``FeatureAssembler``, and the model class named by
    ``model_param.model_type`` is evaluated with
    ``ModelCrossValidation.run_kfold_cross_validation``. The feature and
    model parameters and the formatted aggregate statistics are printed;
    when ``ax`` is given, a scatter plot is drawn on it as well.

    Args:
        dataset: passed to ``read_dataset``; its ``dataset_name`` attribute
            is used in the plot title.
        feature_param: object exposing ``feature_dict``.
        model_param: object exposing ``model_type`` and ``model_param_dict``.
        ax: object to plot on (presumably a matplotlib Axes -- confirm with
            callers), or None to skip plotting.
        result_store: forwarded to ``FeatureAssembler``.
        contentid_groups: forwarded to ``construct_kfold_list`` together
            with the assets.
        logger: forwarded to ``FeatureAssembler`` and to the cross-validation
            call.
        aggregate_method: callable set on every feature result through
            ``set_score_aggregate_method``; defaults to ``np.mean``.

    Returns:
        Tuple ``(assets, cv_output)`` where ``cv_output`` is the value
        returned by ``run_kfold_cross_validation``.
    """

    assets = read_dataset(dataset)
    kfold = construct_kfold_list(assets, contentid_groups)

    fassembler = FeatureAssembler(
        feature_dict=feature_param.feature_dict,
        feature_option_dict=None,
        assets=assets,
        logger=logger,
        delete_workdir=True,
        result_store=result_store,
        optional_dict=None,
        optional_dict2=None,
        # The original author noted parallelize=False, fifo_mode=False as
        # the alternative settings for VQM.
        parallelize=True, fifo_mode=True,
    )
    fassembler.run()
    results = fassembler.results

    for result in results:
        result.set_score_aggregate_method(aggregate_method)

    model_class = TrainTestModel.find_subclass(model_param.model_type)
    # Run k-fold cross validation over the assembled feature results.
    cv_output = ModelCrossValidation.run_kfold_cross_validation(
        model_class,
        model_param.model_param_dict,
        results,
        kfold,
        logger=logger,
    )

    # Single-argument print(...) calls behave the same on Python 2 and 3,
    # whereas the bare print statement is a syntax error on Python 3.
    print('Feature parameters: {}'.format(feature_param.feature_dict))
    print('Model type: {}'.format(model_param.model_type))
    print('Model parameters: {}'.format(model_param.model_param_dict))
    print('Stats: {}'.format(model_class.format_stats(cv_output['aggr_stats'])))

    if ax is not None:
        model_class.plot_scatter(ax, cv_output['aggr_stats'], cv_output['contentids'])
        ax.set_xlabel('True Score')
        ax.set_ylabel("Predicted Score")
        ax.grid()
        ax.set_title("Dataset: {dataset}, Model: {model},\n{stats}".format(
            dataset=dataset.dataset_name,
            model=model_param.model_type,
            stats=model_class.format_stats(cv_output['aggr_stats'])
        ))

    return assets, cv_output
    def test_run_kfold_cross_validation_libsvmnusvr(self):
        # Cross-validate LibsvmNusvrTrainTestModel on self.features with 3
        # passed as the k-fold argument, then compare the aggregate
        # statistics with stored reference values to 4 decimal places.

        # The call form of print works on both Python 2 and Python 3.
        print("test k-fold cross validation on libsvmnusvr...")

        train_test_model_class = LibsvmNusvrTrainTestModel
        model_param = {'norm_type': 'normalize'}

        output = ModelCrossValidation.run_kfold_cross_validation(
            train_test_model_class, model_param, self.features, 3)

        # assertAlmostEqual replaces the deprecated assertAlmostEquals alias,
        # which newer Python versions no longer provide.
        self.assertAlmostEqual(output['aggr_stats']['SRCC'], 0.31666666666666665, places=4)
        self.assertAlmostEqual(output['aggr_stats']['PCC'], 0.33103132578536021, places=4)
        self.assertAlmostEqual(output['aggr_stats']['KENDALL'], 0.27777777777777779, places=4)
        self.assertAlmostEqual(output['aggr_stats']['RMSE'], 1.2855099934718619, places=4)
    def test_run_kfold_cross_validation_extratrees(self):
        # Cross-validate SklearnExtraTreesTrainTestModel on self.features with
        # 3 passed as the k-fold argument, then compare the aggregate
        # statistics with stored reference values to 4 decimal places.

        # The call form of print works on both Python 2 and Python 3.
        print("test k-fold cross validation on extra trees...")

        train_test_model_class = SklearnExtraTreesTrainTestModel
        # n_estimators is pinned so the reference values below do not depend
        # on the estimator-count default of the installed scikit-learn.
        model_param = {'norm_type': 'normalize', 'random_state': 0, 'n_estimators': 10}

        output = ModelCrossValidation.run_kfold_cross_validation(
            train_test_model_class, model_param, self.features, 3)

        # assertAlmostEqual replaces the deprecated assertAlmostEquals alias,
        # which newer Python versions no longer provide.
        self.assertAlmostEqual(output['aggr_stats']['SRCC'], 0.17320508075688773, places=4)
        self.assertAlmostEqual(output['aggr_stats']['PCC'], 0.33023719320146966, places=4)
        self.assertAlmostEqual(output['aggr_stats']['KENDALL'], 0.14907119849998599, places=4)
        self.assertAlmostEqual(output['aggr_stats']['RMSE'], 1.3279056191361394, places=4)
    def test_run_kfold_cross_validation_randomforest(self):
        # Cross-validate SklearnRandomForestTrainTestModel on self.features
        # with 3 passed as the k-fold argument, then compare the aggregate
        # statistics with stored reference values to 4 decimal places.

        # The call form of print works on both Python 2 and Python 3.
        print("test k-fold cross validation on random forest...")

        train_test_model_class = SklearnRandomForestTrainTestModel
        # n_estimators is pinned so the reference values below do not depend
        # on the estimator-count default of the installed scikit-learn
        # (assumes they were produced with 10 estimators -- TODO confirm).
        model_param = {'norm_type': 'normalize', 'random_state': 0, 'n_estimators': 10}

        output = ModelCrossValidation.run_kfold_cross_validation(
            train_test_model_class, model_param, self.features, 3)

        # assertAlmostEqual replaces the deprecated assertAlmostEquals alias,
        # which newer Python versions no longer provide.
        self.assertAlmostEqual(output['aggr_stats']['SRCC'], 0.28452131897694583, places=4)
        self.assertAlmostEqual(output['aggr_stats']['PCC'], 0.1689046198483892, places=4)
        self.assertAlmostEqual(output['aggr_stats']['KENDALL'], 0.084515425472851652, places=4)
        self.assertAlmostEqual(output['aggr_stats']['RMSE'], 1.344683833136588, places=4)
Beispiel #9
0
    def test_run_kfold_cross_validation_randomforest(self):
        # Cross-validate SklearnRandomForestTrainTestModel on self.features
        # with 3 passed as the k-fold argument, then compare the aggregate
        # statistics with stored reference values to 4 decimal places.

        # The call form of print works on both Python 2 and Python 3.
        print("test k-fold cross validation on random forest...")

        train_test_model_class = SklearnRandomForestTrainTestModel
        # n_estimators is pinned so the reference values below do not depend
        # on the estimator-count default of the installed scikit-learn
        # (assumes they were produced with 10 estimators -- TODO confirm).
        model_param = {'norm_type': 'normalize', 'random_state': 0, 'n_estimators': 10}

        output = ModelCrossValidation.run_kfold_cross_validation(
            train_test_model_class, model_param, self.features, 3)

        # assertAlmostEqual replaces the deprecated assertAlmostEquals alias,
        # which newer Python versions no longer provide.
        self.assertAlmostEqual(output['aggr_stats']['SRCC'], 0.28452131897694583, places=4)
        self.assertAlmostEqual(output['aggr_stats']['PCC'], 0.1689046198483892, places=4)
        self.assertAlmostEqual(output['aggr_stats']['KENDALL'], 0.084515425472851652, places=4)
        self.assertAlmostEqual(output['aggr_stats']['RMSE'], 1.344683833136588, places=4)
    def test_run_kfold_cross_validation_with_list_input(self):
        # Cross-validate SklearnRandomForestTrainTestModel on self.features,
        # passing the folds as an explicit list of index lists, then compare
        # the aggregate statistics with stored reference values.

        # The call form of print works on both Python 2 and Python 3.
        print("test k-fold cross validation with list input...")

        train_test_model_class = SklearnRandomForestTrainTestModel
        # n_estimators is pinned so the reference values below do not depend
        # on the estimator-count default of the installed scikit-learn.
        model_param = {'norm_type': 'normalize', 'n_estimators': 10, 'random_state': 0}

        output = ModelCrossValidation.run_kfold_cross_validation(
            train_test_model_class, model_param, self.features,
            [[0, 3, 8], [2, 1, 5], [4, 6, 7]])

        # assertAlmostEqual replaces the deprecated assertAlmostEquals alias,
        # which newer Python versions no longer provide.
        self.assertAlmostEqual(output['aggr_stats']['SRCC'], 0.18333333333333335, places=4)
        self.assertAlmostEqual(output['aggr_stats']['PCC'], 0.35513638509959689, places=4)
        self.assertAlmostEqual(output['aggr_stats']['KENDALL'], 0.1111111111111111, places=3)
        self.assertAlmostEqual(output['aggr_stats']['RMSE'], 1.2740400878438387, places=3)