Code example #1
def test_retrieve_cv_folds():
    """
    Test to make sure that the fold ids get returned correctly after cross-validation
    """

    # Setup
    learner = Learner('LogisticRegression')
    num_folds = 5
    cv_fs, custom_cv_folds = make_cv_folds_data(num_examples_per_fold=2, num_folds=num_folds)

    # Test 1: learner.cross_validate() makes the folds itself.
    expected_fold_ids = {'EXAMPLE_0': '0',
                         'EXAMPLE_1': '4',
                         'EXAMPLE_2': '3',
                         'EXAMPLE_3': '1',
                         'EXAMPLE_4': '2',
                         'EXAMPLE_5': '2',
                         'EXAMPLE_6': '1',
                         'EXAMPLE_7': '0',
                         'EXAMPLE_8': '4',
                         'EXAMPLE_9': '3'}
    _, _, _, skll_fold_ids = learner.cross_validate(cv_fs,
                                                    stratified=True,
                                                    cv_folds=num_folds,
                                                    grid_search=True,
                                                    grid_objective='f1_score_micro',
                                                    shuffle=False,
                                                    save_cv_folds=True)
    assert_equal(skll_fold_ids, expected_fold_ids)

    # Test 2: if we pass in custom fold ids, those are also preserved.
    _, _, _, skll_fold_ids = learner.cross_validate(cv_fs,
                                                    stratified=True,
                                                    cv_folds=custom_cv_folds,
                                                    grid_search=True,
                                                    grid_objective='f1_score_micro',
                                                    shuffle=False,
                                                    save_cv_folds=True)
    assert_equal(skll_fold_ids, custom_cv_folds)

    # Test 3: when learner.cross_validate() makes the folds but stratified=False
    # and grid_search=False, so that KFold is used.
    expected_fold_ids = {'EXAMPLE_0': '0',
                         'EXAMPLE_1': '0',
                         'EXAMPLE_2': '1',
                         'EXAMPLE_3': '1',
                         'EXAMPLE_4': '2',
                         'EXAMPLE_5': '2',
                         'EXAMPLE_6': '3',
                         'EXAMPLE_7': '3',
                         'EXAMPLE_8': '4',
                         'EXAMPLE_9': '4'}
    _, _, _, skll_fold_ids = learner.cross_validate(cv_fs,
                                                    stratified=False,
                                                    cv_folds=num_folds,
                                                    grid_search=False,
                                                    shuffle=False,
                                                    save_cv_folds=True)
    assert_equal(skll_fold_ids, expected_fold_ids)
Code example #2
def test_retrieve_cv_folds():
    """
    Test to make sure that the fold ids get returned correctly after cross-validation
    """

    # Setup
    learner = Learner('LogisticRegression')
    num_folds = 5
    cv_fs, custom_cv_folds = make_cv_folds_data(num_examples_per_fold=2, num_folds=num_folds)

    # Test 1: learner.cross_validate() makes the folds itself.
    expected_fold_ids = {'EXAMPLE_0': '0',
                         'EXAMPLE_1': '4',
                         'EXAMPLE_2': '3',
                         'EXAMPLE_3': '1',
                         'EXAMPLE_4': '2',
                         'EXAMPLE_5': '2',
                         'EXAMPLE_6': '1',
                         'EXAMPLE_7': '0',
                         'EXAMPLE_8': '4',
                         'EXAMPLE_9': '3'}
    _, _, _, skll_fold_ids, _ = learner.cross_validate(cv_fs,
                                                       stratified=True,
                                                       cv_folds=num_folds,
                                                       grid_search=True,
                                                       grid_objective='f1_score_micro',
                                                       shuffle=False,
                                                       save_cv_folds=True)
    assert_equal(skll_fold_ids, expected_fold_ids)

    # Test 2: if we pass in custom fold ids, those are also preserved.
    _, _, _, skll_fold_ids, _ = learner.cross_validate(cv_fs,
                                                       stratified=True,
                                                       cv_folds=custom_cv_folds,
                                                       grid_search=True,
                                                       grid_objective='f1_score_micro',
                                                       shuffle=False,
                                                       save_cv_folds=True)
    assert_equal(skll_fold_ids, custom_cv_folds)

    # Test 3: when learner.cross_validate() makes the folds but stratified=False
    # and grid_search=False, so that KFold is used.
    expected_fold_ids = {'EXAMPLE_0': '0',
                         'EXAMPLE_1': '0',
                         'EXAMPLE_2': '1',
                         'EXAMPLE_3': '1',
                         'EXAMPLE_4': '2',
                         'EXAMPLE_5': '2',
                         'EXAMPLE_6': '3',
                         'EXAMPLE_7': '3',
                         'EXAMPLE_8': '4',
                         'EXAMPLE_9': '4'}
    _, _, _, skll_fold_ids, _ = learner.cross_validate(cv_fs,
                                                       stratified=False,
                                                       cv_folds=num_folds,
                                                       grid_search=False,
                                                       shuffle=False,
                                                       save_cv_folds=True)
    assert_equal(skll_fold_ids, expected_fold_ids)
Code example #3
def check_bad_xval_float_classes(do_stratified_xval):

    float_class_fs = make_float_class_data()
    prediction_prefix = join(_my_dir, 'output', 'float_class')
    learner = Learner('LogisticRegression')
    learner.cross_validate(float_class_fs,
                           stratified=do_stratified_xval,
                           grid_objective='accuracy',
                           prediction_prefix=prediction_prefix)
Code example #4
def check_bad_xval_float_classes(do_stratified_xval):

    float_class_fs = make_float_class_data()
    prediction_prefix = join(_my_dir, 'output', 'float_class')
    learner = Learner('LogisticRegression')
    learner.cross_validate(float_class_fs,
                           stratified=do_stratified_xval,
                           grid_search=True,
                           grid_objective='accuracy',
                           prediction_prefix=prediction_prefix)
Code example #5
def test_rare_class():
    """
    Test cross-validation when some labels are very rare
    """

    rare_class_fs = make_rare_class_data()
    prediction_prefix = join(_my_dir, 'output', 'rare_class')
    learner = Learner('LogisticRegression')
    learner.cross_validate(rare_class_fs,
                           grid_objective='unweighted_kappa',
                           prediction_prefix=prediction_prefix)

    with open(prediction_prefix + '_predictions.tsv', 'r') as f:
        reader = csv.reader(f, dialect='excel-tab')
        next(reader)
        pred = [row[1] for row in reader]

        eq_(len(pred), 15)
Code example #6
def test_xval_float_classes_as_strings():
    """
    Test that classification with float labels encoded as strings works
    """

    float_class_fs = make_float_class_data(labels_as_strings=True)
    prediction_prefix = join(_my_dir, 'output', 'float_class')
    learner = Learner('LogisticRegression')
    learner.cross_validate(float_class_fs,
                           grid_objective='accuracy',
                           prediction_prefix=prediction_prefix)

    with open(prediction_prefix + '_predictions.tsv', 'r') as f:
        reader = csv.reader(f, dialect='excel-tab')
        next(reader)
        pred = [row[1] for row in reader]
        for p in pred:
            assert p in ['1.2', '1.5', '1.8']
Code example #7
def test_rare_class():
    """
    Test cross-validation when some labels are very rare
    """

    rare_class_fs = make_rare_class_data()
    prediction_prefix = join(_my_dir, 'output', 'rare_class')
    learner = Learner('LogisticRegression')
    learner.cross_validate(rare_class_fs,
                           grid_objective='unweighted_kappa',
                           prediction_prefix=prediction_prefix)

    with open(prediction_prefix + '_predictions.tsv', 'r') as f:
        reader = csv.reader(f, dialect='excel-tab')
        next(reader)
        pred = [row[1] for row in reader]

        eq_(len(pred), 15)
Code example #8
File: test_classification.py  Project: ofergold/skll
def test_float_classes():
    """
    Test classification with labels that look like floats
    Make sure that they have been converted to strings as expected
    """

    float_class_fs = make_float_class_data()
    prediction_prefix = join(_my_dir, 'output', 'float_class')
    learner = Learner('LogisticRegression')
    learner.cross_validate(float_class_fs,
                           grid_objective='accuracy',
                           prediction_prefix=prediction_prefix)

    with open(prediction_prefix + '.predictions', 'r') as f:
        reader = csv.reader(f, dialect='excel-tab')
        next(reader)
        pred = [row[1] for row in reader]
        for p in pred:
            assert p in ['1.2', '1.5', '1.8']
Code example #9
def test_xval_float_classes_as_strings():
    """
    Test that classification with float labels encoded as strings works
    """

    float_class_fs = make_float_class_data(labels_as_strings=True)
    prediction_prefix = join(_my_dir, 'output', 'float_class')
    learner = Learner('LogisticRegression')
    learner.cross_validate(float_class_fs,
                           grid_search=True,
                           grid_objective='accuracy',
                           prediction_prefix=prediction_prefix)

    with open(prediction_prefix + '_predictions.tsv', 'r') as f:
        reader = csv.reader(f, dialect='excel-tab')
        next(reader)
        pred = [row[1] for row in reader]
        for p in pred:
            assert p in ['1.2', '1.5', '1.8']
Code example #10
def test_float_classes():
    """
    Test classification with labels that look like floats
    Make sure that they have been converted to strings as expected
    """

    float_class_fs = make_float_class_data()
    prediction_prefix = join(_my_dir, 'output', 'float_class')
    learner = Learner('LogisticRegression')
    learner.cross_validate(float_class_fs,
                           grid_objective='accuracy',
                           prediction_prefix=prediction_prefix)

    with open(prediction_prefix + '.predictions', 'r') as f:
        reader = csv.reader(f, dialect='excel-tab')
        next(reader)
        pred = [row[1] for row in reader]
        for p in pred:
            assert p in ['1.2', '1.5', '1.8']
Code example #11
def check_learner_api_grid_search_no_objective(task='train'):

    (train_fs, test_fs) = make_classification_data(num_examples=500,
                                                   train_test_ratio=0.7,
                                                   num_features=5,
                                                   use_feature_hashing=False,
                                                   non_negative=True)
    learner = Learner('LogisticRegression')
    if task == 'train':
        _ = learner.train(train_fs)
    else:
        _ = learner.cross_validate(train_fs)
Code example #12
def check_learner_api_grid_search_no_objective(task='train'):

    (train_fs,
     test_fs) = make_classification_data(num_examples=500,
                                         train_test_ratio=0.7,
                                         num_features=5,
                                         use_feature_hashing=False,
                                         non_negative=True)
    learner = Learner('LogisticRegression')
    if task == 'train':
        _ = learner.train(train_fs)
    else:
        _ = learner.cross_validate(train_fs)
Code example #13
File: test_cv.py  Project: latuji/skll
def test_retrieve_cv_folds():
    """
    Test to make sure that the fold ids get returned correctly after cross-validation
    """

    learner = Learner('LogisticRegression')
    num_folds = 5
    cv_fs, custom_cv_folds = make_cv_folds_data(num_examples_per_fold=2,
                                                num_folds=num_folds)

    # First test where learner.cross_validate makes the folds itself
    expected_fold_ids = {
        'EXAMPLE_0': '0',
        'EXAMPLE_1': '4',
        'EXAMPLE_2': '3',
        'EXAMPLE_3': '1',
        'EXAMPLE_4': '2',
        'EXAMPLE_5': '2',
        'EXAMPLE_6': '1',
        'EXAMPLE_7': '0',
        'EXAMPLE_8': '4',
        'EXAMPLE_9': '3'
    }

    _, _, skll_fold_ids = learner.cross_validate(cv_fs,
                                                 cv_folds=num_folds,
                                                 save_cv_folds=True,
                                                 grid_search=True)

    assert_equal(skll_fold_ids, expected_fold_ids)

    # Now test that if we pass in custom fold ids, those are also preserved
    _, _, skll_fold_ids = learner.cross_validate(cv_fs,
                                                 cv_folds=custom_cv_folds,
                                                 save_cv_folds=True,
                                                 grid_search=True)

    assert_equal(skll_fold_ids, custom_cv_folds)
Code example #14
File: test_cv.py  Project: ChristianGeng/skll
def test_retrieve_cv_folds():
    """
    Test to make sure that the fold ids get returned correctly after cross-validation
    """

    learner = Learner('LogisticRegression')
    num_folds = 5
    cv_fs, custom_cv_folds = make_cv_folds_data(num_examples_per_fold=2, num_folds=num_folds)

    # First test where learner.cross_validate makes the folds itself
    expected_fold_ids = {'EXAMPLE_0': '0', 'EXAMPLE_1': '4', 'EXAMPLE_2': '3', 'EXAMPLE_3': '1',
                         'EXAMPLE_4': '2', 'EXAMPLE_5': '2', 'EXAMPLE_6': '1', 'EXAMPLE_7': '0',
                         'EXAMPLE_8': '4', 'EXAMPLE_9': '3'}

    _, _, skll_fold_ids = learner.cross_validate(cv_fs, cv_folds=num_folds, save_cv_folds=True,
                                                 grid_search=True)

    assert_equal(skll_fold_ids, expected_fold_ids)

    # Now test that if we pass in custom fold ids, those are also preserved
    _, _, skll_fold_ids = learner.cross_validate(cv_fs, cv_folds=custom_cv_folds, save_cv_folds=True,
                                                 grid_search=True)

    assert_equal(skll_fold_ids, custom_cv_folds)
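Note on the return signatures above: the three-value unpacking in examples #13 and #14 comes from an older SKLL release, example #1 unpacks four values, and examples #2 and #18 unpack five (in the five-value form the extra element holds the per-fold models). The fragment below is a minimal sketch, assuming a `learner` and `cv_fs` built as in the examples above, of how the fold-id dictionary could be fetched without hard-coding the tuple length; it is an illustration, not SKLL's documented API.

# A minimal sketch, assuming `learner` and `cv_fs` are set up as in the
# examples above; exact return signatures vary across the SKLL versions shown.
results = learner.cross_validate(cv_fs,
                                 cv_folds=5,
                                 grid_search=True,
                                 grid_objective='f1_score_micro',
                                 save_cv_folds=True)
# The fold-id dictionary is the last element in the three- and four-value
# variants and sits just before the per-fold models in the five-value one.
skll_fold_ids = results[3] if len(results) == 5 else results[-1]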
Code example #15
def test_specified_cv_folds():
    """
    Test to check cross-validation results with specified folds, feature hashing, and RBFSampler
    """
    # This runs four tests.

    # The first does not use feature hashing, has 9 features (3 numeric, 6
    # binary), uses pre-specified folds, and has less than 60% accuracy for
    # each of the 3 folds.

    # The second uses feature hashing with 4 features, uses 10 folds (not
    # pre-specified), and has more than 70% accuracy for each of the 10
    # folds.

    # The third is the same as the first but uses an RBFSampler.

    # The fourth is the same as the second but uses an RBFSampler.

    for test_value, assert_func, expected_folds, use_hashing, use_sampler in \
            [(0.58, assert_less, 3, False, False),
             (0.1, assert_greater, 10, True, False),
             (0.57, assert_less, 3, False, True),
             (0.69, assert_greater, 10, True, True)]:

        sampler = 'RBFSampler' if use_sampler else None
        learner = Learner('LogisticRegression', sampler=sampler)
        cv_fs, custom_cv_folds = make_cv_folds_data(
            use_feature_hashing=use_hashing)
        folds = custom_cv_folds if not use_hashing else 10
        (grid_scores, _, _, _) = \
            learner.cross_validate(cv_fs,
                                   cv_folds=folds,
                                   grid_search=True,
                                   grid_objective='f1_score_micro')
        fold_test_scores = [t[-2] for t in grid_scores]

        overall_score = np.mean(fold_test_scores)

        assert_func(overall_score, test_value)

        eq_(len(fold_test_scores), expected_folds)
        for fold_score in fold_test_scores:
            assert_func(fold_score, test_value)
Code example #16
File: test_cv.py  Project: yiyiwang515/skll
def test_specified_cv_folds():
    """
    Test to check cross-validation results with specified folds, feature hashing, and RBFSampler
    """
    # This runs four tests.

    # The first does not use feature hashing, has 9 features (3 numeric, 6
    # binary), uses pre-specified folds, and has less than 60% accuracy for
    # each of the 3 folds.

    # The second uses feature hashing with 4 features, uses 10 folds (not
    # pre-specified), and has more than 70% accuracy for each of the 10
    # folds.

    # The third is the same as the first but uses an RBFSampler.

    # The fourth is the same as the second but uses an RBFSampler.

    for test_value, assert_func, expected_folds, use_hashing, use_sampler in \
            [(0.58, assert_less, 3, False, False),
             (0.1, assert_greater, 10, True, False),
             (0.57, assert_less, 3, False, True),
             (0.69, assert_greater, 10, True, True)]:

        sampler = 'RBFSampler' if use_sampler else None
        learner = Learner('LogisticRegression', sampler=sampler)
        cv_fs, custom_cv_folds = make_cv_folds_data(
            use_feature_hashing=use_hashing)
        folds = custom_cv_folds if not use_hashing else 10
        (grid_scores, _, _, _) = \
            learner.cross_validate(cv_fs,
                                   cv_folds=folds,
                                   grid_search=True,
                                   grid_objective='f1_score_micro')
        fold_test_scores = [t[-2] for t in grid_scores]

        overall_score = np.mean(fold_test_scores)

        assert_func(overall_score, test_value)

        eq_(len(fold_test_scores), expected_folds)
        for fold_score in fold_test_scores:
            assert_func(fold_score, test_value)
Code example #17
File: test_cv.py  Project: ClimbsRocks/skll
def test_specified_cv_folds():
    """
    Test to check cross-validation results with specified folds, feature hashing, and RBFSampler
    """
    # This runs four tests.

    # The first does not use feature hashing, has 9 features (3 numeric, 6
    # binary), uses pre-specified folds, and has less than 60% accuracy for
    # each of the 3 folds.

    # The second uses feature hashing with 4 features, uses 10 folds (not
    # pre-specified), and has more than 70% accuracy for each of the 10
    # folds.

    # The third is the same as the first but uses an RBFSampler.

    # The fourth is the same as the second but uses an RBFSampler.

    for test_value, assert_func, grid_size, use_hashing, use_sampler in [
        (0.55, assert_less, 3, False, False),
        (0.1, assert_greater, 10, True, False),
        (0.53, assert_less, 3, False, True),
        (0.7, assert_greater, 10, True, True),
    ]:

        sampler = "RBFSampler" if use_sampler else None
        learner = Learner("LogisticRegression", sampler=sampler)
        cv_fs, custom_cv_folds = make_cv_folds_data(use_feature_hashing=use_hashing)
        folds = custom_cv_folds if not use_hashing else 10
        cv_output = learner.cross_validate(cv_fs, cv_folds=folds, grid_search=True)
        fold_test_scores = [t[-1] for t in cv_output[0]]

        overall_score = np.mean(fold_test_scores)

        assert_func(overall_score, test_value)

        eq_(len(fold_test_scores), grid_size)
        for fold_score in fold_test_scores:
            assert_func(fold_score, test_value)
Code example #18
def _classify_featureset(args):
    """
    Classification job to be submitted to grid.

    Parameters
    ----------
    args : dict
        A dictionary with arguments for classifying the
        ``FeatureSet`` instance.

    Returns
    -------
    res : list of dicts
        The results of the classification, in the format
        of a list of dictionaries.

    Raises
    ------
    ValueError
        If extra unknown arguments are passed to the function.
    """

    # Extract all the arguments.
    # (There doesn't seem to be a better way to do this since one can't specify
    # required keyword arguments.)

    experiment_name = args.pop("experiment_name")
    task = args.pop("task")
    sampler = args.pop("sampler")
    feature_hasher = args.pop("feature_hasher")
    hasher_features = args.pop("hasher_features")
    job_name = args.pop("job_name")
    featureset = args.pop("featureset")
    featureset_name = args.pop("featureset_name")
    learner_name = args.pop("learner_name")
    train_path = args.pop("train_path")
    test_path = args.pop("test_path")
    train_set_name = args.pop("train_set_name")
    test_set_name = args.pop("test_set_name")
    shuffle = args.pop('shuffle')
    model_path = args.pop("model_path")
    prediction_prefix = args.pop("prediction_prefix")
    grid_search = args.pop("grid_search")
    grid_objective = args.pop("grid_objective")
    output_metrics = args.pop("output_metrics")
    suffix = args.pop("suffix")
    job_log_file = args.pop("log_file")
    job_log_level = args.pop("log_level")
    probability = args.pop("probability")
    pipeline = args.pop("pipeline")
    results_path = args.pop("results_path")
    fixed_parameters = args.pop("fixed_parameters")
    sampler_parameters = args.pop("sampler_parameters")
    param_grid = args.pop("param_grid")
    pos_label_str = args.pop("pos_label_str")
    overwrite = args.pop("overwrite")
    feature_scaling = args.pop("feature_scaling")
    min_feature_count = args.pop("min_feature_count")
    folds_file = args.pop("folds_file")
    grid_search_jobs = args.pop("grid_search_jobs")
    grid_search_folds = args.pop("grid_search_folds")
    cv_folds = args.pop("cv_folds")
    save_cv_folds = args.pop("save_cv_folds")
    save_cv_models = args.pop("save_cv_models")
    use_folds_file_for_grid_search = args.pop("use_folds_file_for_grid_search")
    stratified_folds = args.pop("do_stratified_folds")
    label_col = args.pop("label_col")
    id_col = args.pop("id_col")
    ids_to_floats = args.pop("ids_to_floats")
    class_map = args.pop("class_map")
    custom_learner_path = args.pop("custom_learner_path")
    custom_metric_path = args.pop("custom_metric_path")
    quiet = args.pop('quiet', False)
    learning_curve_cv_folds = args.pop("learning_curve_cv_folds")
    learning_curve_train_sizes = args.pop("learning_curve_train_sizes")

    if args:
        raise ValueError(("Extra arguments passed to _classify_featureset: "
                          "{}").format(args.keys()))
    start_timestamp = datetime.datetime.now()

    # create a new SKLL logger for this specific job and
    # use the given log level
    logger = get_skll_logger(job_name, job_log_file, log_level=job_log_level)

    try:

        # log messages
        logger.info("Task: {}".format(task))

        # check if we have any possible custom metrics
        possible_custom_metric_names = []
        for metric_name in output_metrics + [grid_objective]:
            # metrics that are not in `SCORERS` or `None` are candidates
            # (the `None` is a by-product of how jobs with single tuning
            # objectives are created)
            if metric_name not in SCORERS and metric_name is not None:
                possible_custom_metric_names.append(metric_name)
            # if the metric is already in `SCORERS`, is it a custom one
            # that we already registered? if so, log that
            elif metric_name in _CUSTOM_METRICS:
                logger.info(
                    f"custom metric '{metric_name}' is already registered")

        # initialize list that will hold any invalid metrics
        # that we could not register as custom metrics
        invalid_metric_names = []

        # if we have possible custom metrics
        if possible_custom_metric_names:

            # check that we have a file to load them from
            if not custom_metric_path:
                raise ValueError(
                    f"invalid metrics specified: {possible_custom_metric_names}"
                )
            else:
                # try to register each possible custom metric
                # raise an exception if we fail, if we don't then
                # add the custom metric function to `globals()` so
                # that it serializes properly for gridmap
                for custom_metric_name in possible_custom_metric_names:
                    try:
                        custom_metric_func = register_custom_metric(
                            custom_metric_path, custom_metric_name)
                    except (AttributeError, NameError, ValueError):
                        invalid_metric_names.append(custom_metric_name)
                    else:
                        logger.info(f"registered '{custom_metric_name}' as a "
                                    f"custom metric")
                        globals()[custom_metric_name] = custom_metric_func

        # raise an error if we have any invalid metrics
        if invalid_metric_names:
            raise ValueError(
                f"invalid metrics specified: {invalid_metric_names}. "
                f"If these are custom metrics, check the function "
                f"names.")

        if task == 'cross_validate':
            if isinstance(cv_folds, int):
                num_folds = cv_folds
            else:  # folds_file was used, so count the unique fold ids.
                num_folds = len(set(cv_folds.values()))
            logger.info("Cross-validating ({} folds) on {}, feature "
                        "set {} ...".format(num_folds, train_set_name,
                                            featureset))
        elif task == 'evaluate':
            logger.info("Training on {}, Test on {}, "
                        "feature set {} ...".format(train_set_name,
                                                    test_set_name, featureset))
        elif task == 'train':
            logger.info("Training on {}, feature set {} ...".format(
                train_set_name, featureset))
        elif task == 'learning_curve':
            logger.info("Generating learning curve "
                        "({} 80/20 folds, sizes={}, objective={}) on {}, "
                        "feature set {} ...".format(
                            learning_curve_cv_folds,
                            learning_curve_train_sizes, grid_objective,
                            train_set_name, featureset))
        else:  # predict
            logger.info("Training on {}, Making predictions on {}, "
                        "feature set {} ...".format(train_set_name,
                                                    test_set_name, featureset))

        # check whether a trained model on the same data with the same
        # featureset already exists; if so, load it and then use it on test data
        modelfile = join(model_path, '{}.model'.format(job_name))
        if (task in ['cross_validate', 'learning_curve']
                or not exists(modelfile) or overwrite):
            train_examples = load_featureset(train_path,
                                             featureset,
                                             suffix,
                                             label_col=label_col,
                                             id_col=id_col,
                                             ids_to_floats=ids_to_floats,
                                             quiet=quiet,
                                             class_map=class_map,
                                             feature_hasher=feature_hasher,
                                             num_features=hasher_features,
                                             logger=logger)

            train_set_size = len(train_examples.ids)
            if not train_examples.has_labels:
                raise ValueError('Training examples do not have labels')
            # initialize a classifier object
            learner = Learner(learner_name,
                              probability=probability,
                              pipeline=pipeline,
                              feature_scaling=feature_scaling,
                              model_kwargs=fixed_parameters,
                              pos_label_str=pos_label_str,
                              min_feature_count=min_feature_count,
                              sampler=sampler,
                              sampler_kwargs=sampler_parameters,
                              custom_learner_path=custom_learner_path,
                              logger=logger)

        # load the model if it already exists
        else:
            # import custom learner into global namespace if we are reusing
            # a saved model
            if custom_learner_path:
                globals()[learner_name] = load_custom_learner(
                    custom_learner_path, learner_name)
            train_set_size = 'unknown'
            if exists(modelfile) and not overwrite:
                logger.info("Loading pre-existing {} model: {}".format(
                    learner_name, modelfile))
            learner = Learner.from_file(modelfile)

            # attach the job logger to this learner
            learner.logger = logger

        # Load test set if there is one
        if task == 'evaluate' or task == 'predict':
            test_examples = load_featureset(test_path,
                                            featureset,
                                            suffix,
                                            label_col=label_col,
                                            id_col=id_col,
                                            ids_to_floats=ids_to_floats,
                                            quiet=quiet,
                                            class_map=class_map,
                                            feature_hasher=feature_hasher,
                                            num_features=hasher_features)
            test_set_size = len(test_examples.ids)
        else:
            test_set_size = 'n/a'

        # compute information about xval and grid folds that can be put in results
        # in readable form
        if isinstance(cv_folds, dict):
            cv_folds_to_print = '{} via folds file'.format(
                len(set(cv_folds.values())))
        else:
            cv_folds_to_print = str(cv_folds)

        if isinstance(grid_search_folds, dict):
            grid_search_folds_to_print = \
                '{} via folds file'.format(len(set(grid_search_folds.values())))
        else:
            grid_search_folds_to_print = str(grid_search_folds)

        # create a list of dictionaries of the results information
        learner_result_dict_base = {
            'experiment_name': experiment_name,
            'train_set_name': train_set_name,
            'train_set_size': train_set_size,
            'test_set_name': test_set_name,
            'test_set_size': test_set_size,
            'featureset': json.dumps(featureset),
            'featureset_name': featureset_name,
            'shuffle': shuffle,
            'learner_name': learner_name,
            'task': task,
            'start_timestamp': start_timestamp.strftime('%d %b %Y %H:%M:%S.%f'),
            'version': __version__,
            'feature_scaling': feature_scaling,
            'folds_file': folds_file,
            'grid_search': grid_search,
            'grid_objective': grid_objective,
            'grid_search_folds': grid_search_folds_to_print,
            'min_feature_count': min_feature_count,
            'cv_folds': cv_folds_to_print,
            'using_folds_file': (isinstance(cv_folds, dict) or
                                 isinstance(grid_search_folds, dict)),
            'save_cv_folds': save_cv_folds,
            'save_cv_models': save_cv_models,
            'use_folds_file_for_grid_search': use_folds_file_for_grid_search,
            'stratified_folds': stratified_folds,
            'scikit_learn_version': SCIKIT_VERSION
        }

        # check if we're doing cross-validation, because we only load/save
        # models when we're not.
        task_results = None
        if task == 'cross_validate':
            logger.info('Cross-validating')
            (
                task_results, grid_scores, grid_search_cv_results_dicts,
                skll_fold_ids, models
            ) = learner.cross_validate(
                train_examples,
                shuffle=shuffle,
                stratified=stratified_folds,
                prediction_prefix=prediction_prefix,
                grid_search=grid_search,
                grid_search_folds=grid_search_folds,
                cv_folds=cv_folds,
                grid_objective=grid_objective,
                output_metrics=output_metrics,
                param_grid=param_grid,
                grid_jobs=grid_search_jobs,
                save_cv_folds=save_cv_folds,
                save_cv_models=save_cv_models,
                use_custom_folds_for_grid_search=use_folds_file_for_grid_search
            )
            if models:
                for index, m in enumerate(models, start=1):
                    modelfile = join(model_path,
                                     '{}_fold{}.model'.format(job_name, index))
                    m.save(modelfile)
        elif task == 'learning_curve':
            logger.info("Generating learning curve(s)")
            (curve_train_scores, curve_test_scores,
             computed_curve_train_sizes) = learner.learning_curve(
                 train_examples,
                 grid_objective,
                 cv_folds=learning_curve_cv_folds,
                 train_sizes=learning_curve_train_sizes)
        else:
            # if we do not have a saved model, we need to train one.
            if not exists(modelfile) or overwrite:
                logger.info("Featurizing and training new {} model".format(
                    learner_name))

                (best_score, grid_search_cv_results) = learner.train(
                    train_examples,
                    shuffle=shuffle,
                    grid_search=grid_search,
                    grid_search_folds=grid_search_folds,
                    grid_objective=grid_objective,
                    param_grid=param_grid,
                    grid_jobs=grid_search_jobs)
                grid_scores = [best_score]
                grid_search_cv_results_dicts = [grid_search_cv_results]

                # save model
                if model_path:
                    learner.save(modelfile)

                if grid_search:
                    logger.info("Best {} grid search score: {}".format(
                        grid_objective, round(best_score, 3)))
            else:
                grid_scores = [None]
                grid_search_cv_results_dicts = [None]

            # print out the parameters
            param_out = ('{}: {}'.format(param_name, param_value)
                         for param_name, param_value in
                         learner.model.get_params().items())
            logger.info("Hyperparameters: {}".format(', '.join(param_out)))

            # run on test set or cross-validate on training data,
            # depending on what was asked for
            if task == 'evaluate':
                logger.info("Evaluating predictions")
                task_results = [
                    learner.evaluate(test_examples,
                                     prediction_prefix=prediction_prefix,
                                     grid_objective=grid_objective,
                                     output_metrics=output_metrics)
                ]
            elif task == 'predict':
                logger.info("Writing predictions")
                # we set `class_labels` to `False` so that if the learner is
                # probabilistic, probabilities are written instead of labels
                learner.predict(test_examples,
                                prediction_prefix=prediction_prefix,
                                class_labels=False)
            # do nothing here for train

        end_timestamp = datetime.datetime.now()
        learner_result_dict_base['end_timestamp'] = end_timestamp.strftime(
            '%d %b %Y %H:%M:%S.%f')
        total_time = end_timestamp - start_timestamp
        learner_result_dict_base['total_time'] = str(total_time)

        if task == 'cross_validate' or task == 'evaluate':
            results_json_path = join(results_path,
                                     '{}.results.json'.format(job_name))

            res = _create_learner_result_dicts(task_results, grid_scores,
                                               grid_search_cv_results_dicts,
                                               learner_result_dict_base)

            # write out the result dictionary to a json file
            with open(results_json_path, 'w') as json_file:
                json.dump(res, json_file, cls=NumpyTypeEncoder)

            with open(join(results_path, '{}.results'.format(job_name)),
                      'w') as output_file:
                _print_fancy_output(res, output_file)

        elif task == 'learning_curve':
            results_json_path = join(results_path,
                                     '{}.results.json'.format(job_name))

            res = {}
            res.update(learner_result_dict_base)
            res.update({
                'learning_curve_cv_folds': learning_curve_cv_folds,
                'given_curve_train_sizes': learning_curve_train_sizes,
                'learning_curve_train_scores_means': np.mean(curve_train_scores, axis=1),
                'learning_curve_test_scores_means': np.mean(curve_test_scores, axis=1),
                'learning_curve_train_scores_stds': np.std(curve_train_scores, axis=1, ddof=1),
                'learning_curve_test_scores_stds': np.std(curve_test_scores, axis=1, ddof=1),
                'computed_curve_train_sizes': computed_curve_train_sizes
            })

            # we need to return and write out a list of dictionaries
            res = [res]

            # write out the result dictionary to a json file
            with open(results_json_path, 'w') as json_file:
                json.dump(res, json_file, cls=NumpyTypeEncoder)

        # For all other tasks, i.e. train or predict
        else:
            if results_path:
                results_json_path = join(results_path,
                                         '{}.results.json'.format(job_name))

                assert len(grid_scores) == 1
                assert len(grid_search_cv_results_dicts) == 1
                grid_search_cv_results_dict = {"grid_score": grid_scores[0]}
                grid_search_cv_results_dict["grid_search_cv_results"] = \
                    grid_search_cv_results_dicts[0]
                grid_search_cv_results_dict.update(learner_result_dict_base)
                # write out the result dictionary to a json file
                with open(results_json_path, 'w') as json_file:
                    json.dump(grid_search_cv_results_dict,
                              json_file,
                              cls=NumpyTypeEncoder)
            res = [learner_result_dict_base]

        # write out the cv folds if required
        if task == 'cross_validate' and save_cv_folds:
            skll_fold_ids_file = experiment_name + '_skll_fold_ids.csv'
            with open(join(results_path, skll_fold_ids_file),
                      'w') as output_file:
                _write_skll_folds(skll_fold_ids, output_file)

    finally:
        close_and_remove_logger_handlers(logger)

    return res
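A note on the long run of args.pop() calls at the top of _classify_featureset above: every expected key is popped out of the incoming dictionary, and anything left over triggers a ValueError, which is how the function simulates required keyword arguments. Below is a stripped-down sketch of that pop-and-validate pattern; the function and key names are illustrative only and not part of SKLL.

# A generic sketch of the pop-and-validate pattern used above; run_job and
# its keys are illustrative, not SKLL's API.
def run_job(args):
    # pop each required "keyword" argument out of the dict
    task = args.pop("task")
    learner_name = args.pop("learner_name")
    # anything still left in the dict was not expected, so fail loudly
    if args:
        raise ValueError("Extra arguments passed to run_job: "
                         "{}".format(list(args.keys())))
    return task, learner_name


print(run_job({"task": "cross_validate", "learner_name": "LogisticRegression"}))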
Code example #19
File: experiments.py  Project: ClimbsRocks/skll
def _classify_featureset(args):
    """ Classification job to be submitted to grid """
    # Extract all the arguments.
    # (There doesn't seem to be a better way to do this since one can't specify
    # required keyword arguments.)
    experiment_name = args.pop("experiment_name")
    task = args.pop("task")
    sampler = args.pop("sampler")
    feature_hasher = args.pop("feature_hasher")
    hasher_features = args.pop("hasher_features")
    job_name = args.pop("job_name")
    featureset = args.pop("featureset")
    featureset_name = args.pop("featureset_name")
    learner_name = args.pop("learner_name")
    train_path = args.pop("train_path")
    test_path = args.pop("test_path")
    train_set_name = args.pop("train_set_name")
    test_set_name = args.pop("test_set_name")
    shuffle = args.pop('shuffle')
    model_path = args.pop("model_path")
    prediction_prefix = args.pop("prediction_prefix")
    grid_search = args.pop("grid_search")
    grid_objective = args.pop("grid_objective")
    suffix = args.pop("suffix")
    log_path = args.pop("log_path")
    probability = args.pop("probability")
    results_path = args.pop("results_path")
    fixed_parameters = args.pop("fixed_parameters")
    sampler_parameters = args.pop("sampler_parameters")
    param_grid = args.pop("param_grid")
    pos_label_str = args.pop("pos_label_str")
    overwrite = args.pop("overwrite")
    feature_scaling = args.pop("feature_scaling")
    min_feature_count = args.pop("min_feature_count")
    grid_search_jobs = args.pop("grid_search_jobs")
    grid_search_folds = args.pop("grid_search_folds")
    cv_folds = args.pop("cv_folds")
    stratified_folds = args.pop("do_stratified_folds")
    label_col = args.pop("label_col")
    id_col = args.pop("id_col")
    ids_to_floats = args.pop("ids_to_floats")
    class_map = args.pop("class_map")
    custom_learner_path = args.pop("custom_learner_path")
    quiet = args.pop('quiet', False)

    if args:
        raise ValueError(("Extra arguments passed to _classify_featureset: "
                          "{}").format(args.keys()))
    start_timestamp = datetime.datetime.now()

    with open(log_path, 'w') as log_file:
        # logging
        print("Task:", task, file=log_file)
        if task == 'cross_validate':
            print(("Cross-validating ({} folds) on {}, feature " +
                   "set {} ...").format(cv_folds, train_set_name, featureset),
                  file=log_file)
        elif task == 'evaluate':
            print(("Training on {}, Test on {}, " +
                   "feature set {} ...").format(train_set_name, test_set_name,
                                                featureset),
                  file=log_file)
        elif task == 'train':
            print("Training on {}, feature set {} ...".format(train_set_name,
                                                              featureset),
                  file=log_file)
        else:  # predict
            print(("Training on {}, Making predictions about {}, " +
                   "feature set {} ...").format(train_set_name, test_set_name,
                                                featureset),
                  file=log_file)

        # check whether a trained model on the same data with the same
        # featureset already exists; if so, load it and then use it on test data
        modelfile = join(model_path, '{}.model'.format(job_name))
        if task == 'cross_validate' or (not exists(modelfile) or
                                        overwrite):
            train_examples = _load_featureset(train_path, featureset, suffix,
                                              label_col=label_col,
                                              id_col=id_col,
                                              ids_to_floats=ids_to_floats,
                                              quiet=quiet, class_map=class_map,
                                              feature_hasher=feature_hasher,
                                              num_features=hasher_features)

            train_set_size = len(train_examples.ids)
            if not train_examples.has_labels:
                raise ValueError('Training examples do not have labels')
            # initialize a classifier object
            learner = Learner(learner_name,
                              probability=probability,
                              feature_scaling=feature_scaling,
                              model_kwargs=fixed_parameters,
                              pos_label_str=pos_label_str,
                              min_feature_count=min_feature_count,
                              sampler=sampler,
                              sampler_kwargs=sampler_parameters,
                              custom_learner_path=custom_learner_path)
        # load the model if it already exists
        else:
            # import the custom learner path here in case we are reusing a
            # saved model
            if custom_learner_path:
                _import_custom_learner(custom_learner_path, learner_name)
            train_set_size = 'unknown'
            if exists(modelfile) and not overwrite:
                print(('\tloading pre-existing %s model: %s') % (learner_name,
                                                                 modelfile))
            learner = Learner.from_file(modelfile)

        # Load test set if there is one
        if task == 'evaluate' or task == 'predict':
            test_examples = _load_featureset(test_path, featureset, suffix,
                                             label_col=label_col,
                                             id_col=id_col,
                                             ids_to_floats=ids_to_floats,
                                             quiet=quiet, class_map=class_map,
                                             feature_hasher=feature_hasher,
                                             num_features=hasher_features)
            test_set_size = len(test_examples.ids)
        else:
            test_set_size = 'n/a'

        # create a list of dictionaries of the results information
        learner_result_dict_base = {'experiment_name': experiment_name,
                                    'train_set_name': train_set_name,
                                    'train_set_size': train_set_size,
                                    'test_set_name': test_set_name,
                                    'test_set_size': test_set_size,
                                    'featureset': json.dumps(featureset),
                                    'featureset_name': featureset_name,
                                    'shuffle': shuffle,
                                    'learner_name': learner_name,
                                    'task': task,
                                    'start_timestamp':
                                    start_timestamp.strftime('%d %b %Y %H:%M:'
                                                             '%S.%f'),
                                    'version': __version__,
                                    'feature_scaling': feature_scaling,
                                    'grid_search': grid_search,
                                    'grid_objective': grid_objective,
                                    'grid_search_folds': grid_search_folds,
                                    'min_feature_count': min_feature_count,
                                    'cv_folds': cv_folds,
                                    'stratified_folds': stratified_folds,
                                    'scikit_learn_version': SCIKIT_VERSION}

        # check if we're doing cross-validation, because we only load/save
        # models when we're not.
        task_results = None
        if task == 'cross_validate':
            print('\tcross-validating', file=log_file)
            task_results, grid_scores = learner.cross_validate(
                train_examples, shuffle=shuffle, stratified=stratified_folds,
                prediction_prefix=prediction_prefix, grid_search=grid_search,
                grid_search_folds=grid_search_folds, cv_folds=cv_folds,
                grid_objective=grid_objective, param_grid=param_grid,
                grid_jobs=grid_search_jobs)
        else:
            # if we do not have a saved model, we need to train one.
            if not exists(modelfile) or overwrite:
                print(('\tfeaturizing and training new ' +
                       '{} model').format(learner_name),
                      file=log_file)

                if not isinstance(cv_folds, int):
                    grid_search_folds = cv_folds

                best_score = learner.train(train_examples,
                                           shuffle=shuffle,
                                           grid_search=grid_search,
                                           grid_search_folds=grid_search_folds,
                                           grid_objective=grid_objective,
                                           param_grid=param_grid,
                                           grid_jobs=grid_search_jobs)
                grid_scores = [best_score]

                # save model
                if model_path:
                    learner.save(modelfile)

                if grid_search:
                    # note: bankers' rounding is used in python 3,
                    # so these scores may be different between runs in
                    # python 2 and 3 at the final decimal place.
                    print('\tbest {} grid search score: {}'
                          .format(grid_objective, round(best_score, 3)),
                          file=log_file)
            else:
                grid_scores = [None]

            # print out the tuned parameters and best CV score
            param_out = ('{}: {}'.format(param_name, param_value)
                         for param_name, param_value in
                         iteritems(learner.model.get_params()))
            print('\thyperparameters: {}'.format(', '.join(param_out)),
                  file=log_file)

            # run on test set or cross-validate on training data,
            # depending on what was asked for

            if task == 'evaluate':
                print('\tevaluating predictions', file=log_file)
                task_results = [learner.evaluate(
                    test_examples, prediction_prefix=prediction_prefix,
                    grid_objective=grid_objective)]
            elif task == 'predict':
                print('\twriting predictions', file=log_file)
                learner.predict(test_examples,
                                prediction_prefix=prediction_prefix)
            # do nothing here for train

        end_timestamp = datetime.datetime.now()
        learner_result_dict_base['end_timestamp'] = end_timestamp.strftime(
            '%d %b %Y %H:%M:%S.%f')
        total_time = end_timestamp - start_timestamp
        learner_result_dict_base['total_time'] = str(total_time)

        if task == 'cross_validate' or task == 'evaluate':
            results_json_path = join(results_path,
                                     '{}.results.json'.format(job_name))

            res = _create_learner_result_dicts(task_results, grid_scores,
                                               learner_result_dict_base)

            # write out the result dictionary to a json file
            file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
            with open(results_json_path, file_mode) as json_file:
                json.dump(res, json_file, cls=NumpyTypeEncoder)

            with open(join(results_path,
                           '{}.results'.format(job_name)),
                      'w') as output_file:
                _print_fancy_output(res, output_file)
        else:
            res = [learner_result_dict_base]

    return res
Code example #20
File: experiments.py  Project: wavelets/skll
def _classify_featureset(args):
    ''' Classification job to be submitted to grid '''
    # Extract all the arguments.
    # (There doesn't seem to be a better way to do this since one can't specify
    # required keyword arguments.)
    experiment_name = args.pop("experiment_name")
    task = args.pop("task")
    job_name = args.pop("job_name")
    featureset = args.pop("featureset")
    learner_name = args.pop("learner_name")
    train_path = args.pop("train_path")
    test_path = args.pop("test_path")
    train_set_name = args.pop("train_set_name")
    test_set_name = args.pop("test_set_name")
    model_path = args.pop("model_path")
    prediction_prefix = args.pop("prediction_prefix")
    grid_search = args.pop("grid_search")
    grid_objective = args.pop("grid_objective")
    suffix = args.pop("suffix")
    log_path = args.pop("log_path")
    probability = args.pop("probability")
    results_path = args.pop("results_path")
    fixed_parameters = args.pop("fixed_parameters")
    param_grid = args.pop("param_grid")
    pos_label_str = args.pop("pos_label_str")
    overwrite = args.pop("overwrite")
    feature_scaling = args.pop("feature_scaling")
    min_feature_count = args.pop("min_feature_count")
    grid_search_jobs = args.pop("grid_search_jobs")
    cv_folds = args.pop("cv_folds")
    label_col = args.pop("label_col")
    ids_to_floats = args.pop("ids_to_floats")
    class_map = args.pop("class_map")
    quiet = args.pop('quiet', False)
    if args:
        raise ValueError(("Extra arguments passed to _classify_featureset: " +
                          "{}").format(args.keys()))

    timestamp = datetime.datetime.now().strftime('%d %b %Y %H:%M:%S')

    with open(log_path, 'w') as log_file:

        # logging
        print("Task:", task, file=log_file)
        if task == 'cross_validate':
            print(("Cross-validating on {}, feature " +
                   "set {} ...").format(train_set_name, featureset),
                  file=log_file)
        elif task == 'evaluate':
            print(("Training on {}, Test on {}, " +
                   "feature set {} ...").format(train_set_name, test_set_name,
                                                featureset),
                  file=log_file)
        elif task == 'train':
            print("Training on {}, feature set {} ...".format(train_set_name,
                                                              featureset),
                  file=log_file)
        else:  # predict
            print(("Training on {}, Making predictions about {}, " +
                   "feature set {} ...").format(train_set_name, test_set_name,
                                                featureset),
                  file=log_file)

        # check whether a trained model on the same data with the same
        # featureset already exists; if so, load it and then use it on test data
        modelfile = os.path.join(model_path, '{}.model'.format(job_name))

        # load the training and test examples
        if task == 'cross_validate' or (not os.path.exists(modelfile) or
                                        overwrite):
            train_examples = _load_featureset(train_path, featureset, suffix,
                                              label_col=label_col,
                                              ids_to_floats=ids_to_floats,
                                              quiet=quiet, class_map=class_map)
            # initialize a classifier object
            learner = Learner(learner_name,
                              probability=probability,
                              feature_scaling=feature_scaling,
                              model_kwargs=fixed_parameters,
                              pos_label_str=pos_label_str,
                              min_feature_count=min_feature_count)
        # load the model if it already exists
        else:
            if os.path.exists(modelfile) and not overwrite:
                print(('\tloading pre-existing {} ' +
                       'model: {}').format(learner_name, modelfile))
            learner = Learner.from_file(modelfile)

        # Load test set if there is one
        if task == 'evaluate' or task == 'predict':
            test_examples = _load_featureset(test_path, featureset, suffix,
                                             label_col=label_col,
                                             ids_to_floats=ids_to_floats,
                                             quiet=quiet, class_map=class_map,
                                             unlabelled=True)


        # create a list of dictionaries of the results information
        learner_result_dict_base = {'experiment_name': experiment_name,
                                    'train_set_name': train_set_name,
                                    'test_set_name': test_set_name,
                                    'featureset': json.dumps(featureset),
                                    'learner_name': learner_name,
                                    'task': task,
                                    'timestamp': timestamp,
                                    'version': __version__,
                                    'feature_scaling': feature_scaling,
                                    'grid_search': grid_search,
                                    'grid_objective': grid_objective,
                                    'min_feature_count': min_feature_count}

        # check if we're doing cross-validation, because we only load/save
        # models when we're not.
        task_results = None
        if task == 'cross_validate':
            print('\tcross-validating', file=log_file)
            task_results, grid_scores = learner.cross_validate(train_examples,
                                                               prediction_prefix=prediction_prefix,
                                                               grid_search=grid_search,
                                                               cv_folds=cv_folds,
                                                               grid_objective=grid_objective,
                                                               param_grid=param_grid,
                                                               grid_jobs=grid_search_jobs)
        else:
            # if we do not have a saved model, we need to train one.
            if not os.path.exists(modelfile) or overwrite:
                print(('\tfeaturizing and training new ' +
                       '{} model').format(learner_name),
                      file=log_file)

                grid_search_folds = 5
                if not isinstance(cv_folds, int):
                    grid_search_folds = cv_folds

                best_score = learner.train(train_examples,
                                           grid_search=grid_search,
                                           grid_search_folds=grid_search_folds,
                                           grid_objective=grid_objective,
                                           param_grid=param_grid,
                                           grid_jobs=grid_search_jobs)
                grid_scores = [best_score]

                # save model
                if model_path:
                    learner.save(modelfile)

                if grid_search:
                    print('\tbest {} grid search score: {}'
                          .format(grid_objective, round(best_score, 3)),
                          file=log_file)
            else:
                grid_scores = [None]

            # print out the tuned parameters and best CV score
            param_out = ('{}: {}'.format(param_name, param_value)
                         for param_name, param_value in
                         iteritems(learner.model.get_params()))
            print('\thyperparameters: {}'.format(', '.join(param_out)),
                  file=log_file)

            # run on test set or cross-validate on training data,
            # depending on what was asked for

            if task == 'evaluate':
                print('\tevaluating predictions', file=log_file)
                task_results = [learner.evaluate(
                    test_examples, prediction_prefix=prediction_prefix,
                    grid_objective=grid_objective)]
            elif task == 'predict':
                print('\twriting predictions', file=log_file)
                learner.predict(test_examples,
                                prediction_prefix=prediction_prefix)
            # do nothing here for train

        if task == 'cross_validate' or task == 'evaluate':
            results_json_path = os.path.join(results_path,
                                             '{}.results.json'.format(job_name))

            res = _create_learner_result_dicts(task_results, grid_scores,
                                               learner_result_dict_base)

            # write out the result dictionary to a json file
            file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
            with open(results_json_path, file_mode) as json_file:
                json.dump(res, json_file)

            with open(os.path.join(results_path, '{}.results'.format(job_name)),
                      'w') as output_file:
                _print_fancy_output(res, output_file)
        else:
            res = [learner_result_dict_base]

    return res
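
The examples above all consume a single `args` dictionary, pop every key they expect, and raise a ValueError if anything is left over. Below is a minimal, hypothetical sketch of how a caller might assemble such a dictionary and submit the job; the keys mirror the ones popped in the code, but every value (paths, names, settings) is a placeholder, not skll's actual configuration machinery, which normally builds this dictionary from an experiment configuration file.

# Hypothetical sketch: assembling the args dict for _classify_featureset.
# All values are placeholders and would need to point at real data on disk.
job_args = {
    "experiment_name": "my_experiment",
    "task": "evaluate",
    "job_name": "my_experiment_train_vs_test_LogisticRegression",
    "featureset": ["features"],
    "learner_name": "LogisticRegression",
    "train_path": "train/",
    "test_path": "test/",
    "train_set_name": "train",
    "test_set_name": "test",
    "model_path": "models/",
    "prediction_prefix": "predictions/my_experiment",
    "grid_search": True,
    "grid_objective": "f1_score_micro",
    "suffix": ".jsonlines",
    "log_path": "log/my_experiment.log",
    "probability": False,
    "results_path": "results/",
    "fixed_parameters": {},
    "param_grid": None,
    "pos_label_str": None,
    "overwrite": True,
    "feature_scaling": "none",
    "min_feature_count": 1,
    "grid_search_jobs": 1,
    "cv_folds": 10,
    "label_col": "y",
    "ids_to_floats": False,
    "class_map": None,
    "quiet": False,
}

# Because the function pops every key it knows and then checks for leftovers,
# a misspelled key such as "grid_objectives" fails fast with a ValueError.
res = _classify_featureset(job_args)
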
Code example #21
File: experiments.py  Project: nineil/skll
def _classify_featureset(args):
    ''' Classification job to be submitted to grid '''
    # Extract all the arguments.
    # (There doesn't seem to be a better way to do this since one can't specify
    # required keyword arguments.)
    experiment_name = args.pop("experiment_name")
    task = args.pop("task")
    job_name = args.pop("job_name")
    featureset = args.pop("featureset")
    learner_name = args.pop("learner_name")
    train_path = args.pop("train_path")
    test_path = args.pop("test_path")
    train_set_name = args.pop("train_set_name")
    test_set_name = args.pop("test_set_name")
    model_path = args.pop("model_path")
    prediction_prefix = args.pop("prediction_prefix")
    grid_search = args.pop("grid_search")
    grid_objective = args.pop("grid_objective")
    suffix = args.pop("suffix")
    log_path = args.pop("log_path")
    probability = args.pop("probability")
    results_path = args.pop("results_path")
    fixed_parameters = args.pop("fixed_parameters")
    param_grid = args.pop("param_grid")
    pos_label_str = args.pop("pos_label_str")
    overwrite = args.pop("overwrite")
    feature_scaling = args.pop("feature_scaling")
    min_feature_count = args.pop("min_feature_count")
    grid_search_jobs = args.pop("grid_search_jobs")
    cv_folds = args.pop("cv_folds")
    label_col = args.pop("label_col")
    ids_to_floats = args.pop("ids_to_floats")
    class_map = args.pop("class_map")
    quiet = args.pop('quiet', False)
    if args:
        raise ValueError(("Extra arguments passed to _classify_featureset: " +
                          "{}").format(args.keys()))

    timestamp = datetime.datetime.now().strftime('%d %b %Y %H:%M:%S')

    with open(log_path, 'w') as log_file:

        # logging
        print("Task:", task, file=log_file)
        if task == 'cross_validate':
            print(("Cross-validating on {}, feature " + "set {} ...").format(
                train_set_name, featureset),
                  file=log_file)
        elif task == 'evaluate':
            print(
                ("Training on {}, Test on {}, " + "feature set {} ...").format(
                    train_set_name, test_set_name, featureset),
                file=log_file)
        elif task == 'train':
            print("Training on {}, feature set {} ...".format(
                train_set_name, featureset),
                  file=log_file)
        else:  # predict
            print(("Training on {}, Making predictions about {}, " +
                   "feature set {} ...").format(train_set_name, test_set_name,
                                                featureset),
                  file=log_file)

        # check whether a trained model on the same data with the same
        # featureset already exists; if so, load it and use it on the test data
        modelfile = os.path.join(model_path, '{}.model'.format(job_name))

        # load the training and test examples
        if task == 'cross_validate' or (not os.path.exists(modelfile)
                                        or overwrite):
            train_examples = _load_featureset(train_path,
                                              featureset,
                                              suffix,
                                              label_col=label_col,
                                              ids_to_floats=ids_to_floats,
                                              quiet=quiet,
                                              class_map=class_map)
            # initialize a classifier object
            learner = Learner(learner_name,
                              probability=probability,
                              feature_scaling=feature_scaling,
                              model_kwargs=fixed_parameters,
                              pos_label_str=pos_label_str,
                              min_feature_count=min_feature_count)
        # load the model if it already exists
        else:
            if os.path.exists(modelfile) and not overwrite:
                print(('\tloading pre-existing {} ' + 'model: {}').format(
                    learner_name, modelfile))
            learner = Learner.from_file(modelfile)

        # Load test set if there is one
        if task == 'evaluate' or task == 'predict':
            test_examples = _load_featureset(test_path,
                                             featureset,
                                             suffix,
                                             label_col=label_col,
                                             ids_to_floats=ids_to_floats,
                                             quiet=quiet,
                                             class_map=class_map,
                                             unlabelled=True)

        # create a list of dictionaries of the results information
        learner_result_dict_base = {
            'experiment_name': experiment_name,
            'train_set_name': train_set_name,
            'test_set_name': test_set_name,
            'featureset': json.dumps(featureset),
            'learner_name': learner_name,
            'task': task,
            'timestamp': timestamp,
            'version': __version__,
            'feature_scaling': feature_scaling,
            'grid_search': grid_search,
            'grid_objective': grid_objective,
            'min_feature_count': min_feature_count
        }

        # check if we're doing cross-validation, because we only load/save
        # models when we're not.
        task_results = None
        if task == 'cross_validate':
            print('\tcross-validating', file=log_file)
            task_results, grid_scores = learner.cross_validate(
                train_examples,
                prediction_prefix=prediction_prefix,
                grid_search=grid_search,
                cv_folds=cv_folds,
                grid_objective=grid_objective,
                param_grid=param_grid,
                grid_jobs=grid_search_jobs)
        else:
            # if we do not have a saved model, we need to train one.
            if not os.path.exists(modelfile) or overwrite:
                print(('\tfeaturizing and training new ' +
                       '{} model').format(learner_name),
                      file=log_file)

                grid_search_folds = 5
                if not isinstance(cv_folds, int):
                    grid_search_folds = cv_folds

                best_score = learner.train(train_examples,
                                           grid_search=grid_search,
                                           grid_search_folds=grid_search_folds,
                                           grid_objective=grid_objective,
                                           param_grid=param_grid,
                                           grid_jobs=grid_search_jobs)
                grid_scores = [best_score]

                # save model
                if model_path:
                    learner.save(modelfile)

                if grid_search:
                    # note: bankers' rounding is used in python 3,
                    # so these scores may be different between runs in
                    # python 2 and 3 at the final decimal place.
                    print('\tbest {} grid search score: {}'.format(
                        grid_objective, round(best_score, 3)),
                          file=log_file)
            else:
                grid_scores = [None]

            # print out the tuned parameters and best CV score
            param_out = ('{}: {}'.format(param_name, param_value)
                         for param_name, param_value in iteritems(
                             learner.model.get_params()))
            print('\thyperparameters: {}'.format(', '.join(param_out)),
                  file=log_file)

            # run on test set or cross-validate on training data,
            # depending on what was asked for

            if task == 'evaluate':
                print('\tevaluating predictions', file=log_file)
                task_results = [
                    learner.evaluate(test_examples,
                                     prediction_prefix=prediction_prefix,
                                     grid_objective=grid_objective)
                ]
            elif task == 'predict':
                print('\twriting predictions', file=log_file)
                learner.predict(test_examples,
                                prediction_prefix=prediction_prefix)
            # do nothing here for train

        if task == 'cross_validate' or task == 'evaluate':
            results_json_path = os.path.join(
                results_path, '{}.results.json'.format(job_name))

            res = _create_learner_result_dicts(task_results, grid_scores,
                                               learner_result_dict_base)

            # write out the result dictionary to a json file
            file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
            with open(results_json_path, file_mode) as json_file:
                json.dump(res, json_file)

            with open(
                    os.path.join(results_path, '{}.results'.format(job_name)),
                    'w') as output_file:
                _print_fancy_output(res, output_file)
        else:
            res = [learner_result_dict_base]

    return res
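
Note that `cv_folds` in these examples is not necessarily an integer: a mapping from example IDs to fold labels may be passed in as custom folds, and in that case the same mapping is reused as `grid_search_folds` so that the inner grid search respects the custom grouping. A small sketch of the two forms, using invented IDs and labels:

# Illustrative only: the two shapes that cv_folds takes in the examples above.
cv_folds = 10                      # plain k-fold cross-validation

custom_folds = {                   # custom folds: example id -> fold label
    "item_01": "0",
    "item_02": "0",
    "item_03": "1",
    "item_04": "1",
}

# Mirrors the branch in _classify_featureset: anything that is not an int is
# assumed to be a custom fold mapping and is reused for the grid search folds.
grid_search_folds = 5
if not isinstance(custom_folds, int):
    grid_search_folds = custom_folds
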
Code example #22
def _classify_featureset(args):
    """ Classification job to be submitted to grid """
    # Extract all the arguments.
    # (There doesn't seem to be a better way to do this since one can't specify
    # required keyword arguments.)
    experiment_name = args.pop("experiment_name")
    task = args.pop("task")
    sampler = args.pop("sampler")
    feature_hasher = args.pop("feature_hasher")
    hasher_features = args.pop("hasher_features")
    job_name = args.pop("job_name")
    featureset = args.pop("featureset")
    featureset_name = args.pop("featureset_name")
    learner_name = args.pop("learner_name")
    train_path = args.pop("train_path")
    test_path = args.pop("test_path")
    train_set_name = args.pop("train_set_name")
    test_set_name = args.pop("test_set_name")
    shuffle = args.pop('shuffle')
    model_path = args.pop("model_path")
    prediction_prefix = args.pop("prediction_prefix")
    grid_search = args.pop("grid_search")
    grid_objective = args.pop("grid_objective")
    suffix = args.pop("suffix")
    log_path = args.pop("log_path")
    probability = args.pop("probability")
    results_path = args.pop("results_path")
    fixed_parameters = args.pop("fixed_parameters")
    sampler_parameters = args.pop("sampler_parameters")
    param_grid = args.pop("param_grid")
    pos_label_str = args.pop("pos_label_str")
    overwrite = args.pop("overwrite")
    feature_scaling = args.pop("feature_scaling")
    min_feature_count = args.pop("min_feature_count")
    grid_search_jobs = args.pop("grid_search_jobs")
    grid_search_folds = args.pop("grid_search_folds")
    cv_folds = args.pop("cv_folds")
    save_cv_folds = args.pop("save_cv_folds")
    stratified_folds = args.pop("do_stratified_folds")
    label_col = args.pop("label_col")
    id_col = args.pop("id_col")
    ids_to_floats = args.pop("ids_to_floats")
    class_map = args.pop("class_map")
    custom_learner_path = args.pop("custom_learner_path")
    quiet = args.pop('quiet', False)

    if args:
        raise ValueError(("Extra arguments passed to _classify_featureset: "
                          "{}").format(args.keys()))
    start_timestamp = datetime.datetime.now()

    with open(log_path, 'w') as log_file:
        # logging
        print("Task:", task, file=log_file)
        if task == 'cross_validate':
            print(("Cross-validating ({} folds) on {}, feature " +
                   "set {} ...").format(cv_folds, train_set_name, featureset),
                  file=log_file)
        elif task == 'evaluate':
            print(("Training on {}, Test on {}, " +
                   "feature set {} ...").format(train_set_name, test_set_name,
                                                featureset),
                  file=log_file)
        elif task == 'train':
            print("Training on {}, feature set {} ...".format(train_set_name,
                                                              featureset),
                  file=log_file)
        else:  # predict
            print(("Training on {}, Making predictions about {}, " +
                   "feature set {} ...").format(train_set_name, test_set_name,
                                                featureset),
                  file=log_file)

        # check whether a trained model on the same data with the same
        # featureset already exists; if so, load it and use it on the test data
        modelfile = join(model_path, '{}.model'.format(job_name))
        if task == 'cross_validate' or (not exists(modelfile) or
                                        overwrite):
            train_examples = _load_featureset(train_path, featureset, suffix,
                                              label_col=label_col,
                                              id_col=id_col,
                                              ids_to_floats=ids_to_floats,
                                              quiet=quiet, class_map=class_map,
                                              feature_hasher=feature_hasher,
                                              num_features=hasher_features)

            train_set_size = len(train_examples.ids)
            if not train_examples.has_labels:
                raise ValueError('Training examples do not have labels')
            # initialize a classifier object
            learner = Learner(learner_name,
                              probability=probability,
                              feature_scaling=feature_scaling,
                              model_kwargs=fixed_parameters,
                              pos_label_str=pos_label_str,
                              min_feature_count=min_feature_count,
                              sampler=sampler,
                              sampler_kwargs=sampler_parameters,
                              custom_learner_path=custom_learner_path)
        # load the model if it already exists
        else:
            # import the custom learner path here in case we are reusing a
            # saved model
            if custom_learner_path:
                _import_custom_learner(custom_learner_path, learner_name)
            train_set_size = 'unknown'
            if exists(modelfile) and not overwrite:
                print(('\tloading pre-existing %s model: %s') % (learner_name,
                                                                 modelfile))
            learner = Learner.from_file(modelfile)

        # Load test set if there is one
        if task == 'evaluate' or task == 'predict':
            test_examples = _load_featureset(test_path, featureset, suffix,
                                             label_col=label_col,
                                             id_col=id_col,
                                             ids_to_floats=ids_to_floats,
                                             quiet=quiet, class_map=class_map,
                                             feature_hasher=feature_hasher,
                                             num_features=hasher_features)
            test_set_size = len(test_examples.ids)
        else:
            test_set_size = 'n/a'

        # create a list of dictionaries of the results information
        learner_result_dict_base = {'experiment_name': experiment_name,
                                    'train_set_name': train_set_name,
                                    'train_set_size': train_set_size,
                                    'test_set_name': test_set_name,
                                    'test_set_size': test_set_size,
                                    'featureset': json.dumps(featureset),
                                    'featureset_name': featureset_name,
                                    'shuffle': shuffle,
                                    'learner_name': learner_name,
                                    'task': task,
                                    'start_timestamp':
                                    start_timestamp.strftime('%d %b %Y %H:%M:'
                                                             '%S.%f'),
                                    'version': __version__,
                                    'feature_scaling': feature_scaling,
                                    'grid_search': grid_search,
                                    'grid_objective': grid_objective,
                                    'grid_search_folds': grid_search_folds,
                                    'min_feature_count': min_feature_count,
                                    'cv_folds': cv_folds,
                                    'save_cv_folds': save_cv_folds,
                                    'stratified_folds': stratified_folds,
                                    'scikit_learn_version': SCIKIT_VERSION}

        # check if we're doing cross-validation, because we only load/save
        # models when we're not.
        task_results = None
        if task == 'cross_validate':
            print('\tcross-validating', file=log_file)
            task_results, grid_scores, skll_fold_ids = learner.cross_validate(
                train_examples, shuffle=shuffle, stratified=stratified_folds,
                prediction_prefix=prediction_prefix, grid_search=grid_search,
                grid_search_folds=grid_search_folds, cv_folds=cv_folds,
                grid_objective=grid_objective, param_grid=param_grid,
                grid_jobs=grid_search_jobs, save_cv_folds=save_cv_folds)
        else:
            # if we do not have a saved model, we need to train one.
            if not exists(modelfile) or overwrite:
                print(('\tfeaturizing and training new ' +
                       '{} model').format(learner_name),
                      file=log_file)

                if not isinstance(cv_folds, int):
                    grid_search_folds = cv_folds

                best_score = learner.train(train_examples,
                                           shuffle=shuffle,
                                           grid_search=grid_search,
                                           grid_search_folds=grid_search_folds,
                                           grid_objective=grid_objective,
                                           param_grid=param_grid,
                                           grid_jobs=grid_search_jobs)
                grid_scores = [best_score]

                # save model
                if model_path:
                    learner.save(modelfile)

                if grid_search:
                    # note: bankers' rounding is used in python 3,
                    # so these scores may be different between runs in
                    # python 2 and 3 at the final decimal place.
                    print('\tbest {} grid search score: {}'
                          .format(grid_objective, round(best_score, 3)),
                          file=log_file)
            else:
                grid_scores = [None]

            # print out the tuned parameters and best CV score
            param_out = ('{}: {}'.format(param_name, param_value)
                         for param_name, param_value in
                         iteritems(learner.model.get_params()))
            print('\thyperparameters: {}'.format(', '.join(param_out)),
                  file=log_file)

            # run on test set or cross-validate on training data,
            # depending on what was asked for

            if task == 'evaluate':
                print('\tevaluating predictions', file=log_file)
                task_results = [learner.evaluate(
                    test_examples, prediction_prefix=prediction_prefix,
                    grid_objective=grid_objective)]
            elif task == 'predict':
                print('\twriting predictions', file=log_file)
                learner.predict(test_examples,
                                prediction_prefix=prediction_prefix)
            # do nothing here for train

        end_timestamp = datetime.datetime.now()
        learner_result_dict_base['end_timestamp'] = end_timestamp.strftime(
            '%d %b %Y %H:%M:%S.%f')
        total_time = end_timestamp - start_timestamp
        learner_result_dict_base['total_time'] = str(total_time)

        if task == 'cross_validate' or task == 'evaluate':
            results_json_path = join(results_path,
                                     '{}.results.json'.format(job_name))

            res = _create_learner_result_dicts(task_results, grid_scores,
                                               learner_result_dict_base)

            # write out the result dictionary to a json file
            file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
            with open(results_json_path, file_mode) as json_file:
                json.dump(res, json_file, cls=NumpyTypeEncoder)

            with open(join(results_path,
                           '{}.results'.format(job_name)),
                      'w') as output_file:
                _print_fancy_output(res, output_file)
        else:
            res = [learner_result_dict_base]

        # write out the cv folds if required
        if task == 'cross_validate' and save_cv_folds:
            skll_fold_ids_file = experiment_name + '_skll_fold_ids.csv'
            file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
            with open(join(results_path, skll_fold_ids_file),
                      file_mode) as output_file:
                _write_skll_folds(skll_fold_ids, output_file)

    return res
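
The last variant passes `cls=NumpyTypeEncoder` to `json.dump` because evaluation results can contain NumPy scalar types that the standard JSON encoder rejects. The sketch below shows what such an encoder typically looks like; it is an assumption for illustration only, not skll's actual NumpyTypeEncoder.

import json

import numpy as np


class NumpyTypeEncoderSketch(json.JSONEncoder):
    """Hypothetical stand-in for a NumPy-aware encoder: converts NumPy
    scalars and arrays into plain Python types that json can serialize."""

    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super(NumpyTypeEncoderSketch, self).default(obj)


# json.dumps(np.int64(500)) alone raises TypeError, because NumPy integers
# are not Python ints; routing them through default() fixes that.
print(json.dumps({"train_set_size": np.int64(500)}, cls=NumpyTypeEncoderSketch))
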