def test_learner_api_load_into_existing_instance():
    """
    Check that `Learner.load()` works as expected
    """

    # create a LinearSVC instance and train it on some data
    learner1 = Learner('LinearSVC')
    (train_fs,
     test_fs) = make_classification_data(num_examples=200,
                                         num_features=5,
                                         use_feature_hashing=False,
                                         non_negative=True)
    learner1.train(train_fs, grid_search=False)

    # now use `load()` to replace the existing instance with a
    # different saved learner
    other_model_file = join(_my_dir, 'other', 'test_load_saved_model.{}.model'.format(sys.version_info[0]))
    learner1.load(other_model_file)

    # now load the saved model into another instance using the class method
    # `from_file()`
    learner2 = Learner.from_file(other_model_file)

    # check that the two instances are now basically the same
    eq_(learner1.model_type, learner2.model_type)
    eq_(learner1.model_params, learner2.model_params)
    eq_(learner1.model_kwargs, learner2.model_kwargs)
Exemple #2
0
    def __init__(self, model_path, threshold=None, positive_label=1):
        """
        Initialize the predictor.

        Parameters
        ----------
        model_path : str
            Path to use when loading trained model.
        threshold : float, optional
            If the model we're using is generating probabilities
            of the positive label, return 1 if it meets/exceeds
            the given threshold and 0 otherwise.
            Defaults to ``None``.
        positive_label : int, optional
            If the model is only being used to predict the
            probability of a particular class, this
            specifies the index of the class we're
            predicting. 1 = second class, which is default
            for binary classification.
            Defaults to 1.
        """
        self._learner = Learner.from_file(model_path)
        # garyfeng: fixing error msg "AttributeError: 'Learner' object has no attribute 'logger'"
        #     by passing to the learner the root logger
        self._learner.logger = logging.getLogger(__name__)
        self._pos_index = positive_label
        self.threshold = threshold
def test_learner_api_load_into_existing_instance():
    """
    Check that `Learner.load()` works as expected
    """

    # create a LinearSVC instance and train it on some data
    learner1 = Learner('LinearSVC')
    (train_fs, test_fs) = make_classification_data(num_examples=200,
                                                   num_features=5,
                                                   use_feature_hashing=False,
                                                   non_negative=True)
    learner1.train(train_fs, grid_search=False)

    # now use `load()` to replace the existing instance with a
    # different saved learner
    other_model_file = join(
        _my_dir, 'other',
        'test_load_saved_model.{}.model'.format(sys.version_info[0]))
    learner1.load(other_model_file)

    # now load the saved model into another instance using the class method
    # `from_file()`
    learner2 = Learner.from_file(other_model_file)

    # check that the two instances are now basically the same
    eq_(learner1.model_type, learner2.model_type)
    eq_(learner1.model_params, learner2.model_params)
    eq_(learner1.model_kwargs, learner2.model_kwargs)
    def __init__(self,
                 model_path,
                 threshold=None,
                 positive_label=1,
                 logger=None):
        """
        Initialize the predictor.

        Parameters
        ----------
        model_path : str
            Path to use when loading trained model.
        threshold : float, optional
            If the model we're using is generating probabilities
            of the positive label, return 1 if it meets/exceeds
            the given threshold and 0 otherwise.
            Defaults to ``None``.
        positive_label : int, optional
            If the model is only being used to predict the
            probability of a particular class, this
            specifies the index of the class we're
            predicting. 1 = second class, which is default
            for binary classification.
            Defaults to 1.
        logger : logging object, optional
            A logging object. If ``None`` is passed, get logger from ``__name__``.
            Defaults to ``None``.
        """
        # self.logger = logger if logger else logging.getLogger(__name__)
        self._learner = Learner.from_file(model_path)
        self._pos_index = positive_label
        self.threshold = threshold
Exemple #5
0
    def load_model(self, model_path):
        """
        Load the parser model from disk.

        Parameters
        ----------
        model_path : str
            Path to the directory containing the model file.
            The model name is always
            ``rst_parsing_all_feats_LogisticRegression.model``.
        """
        model_name = "rst_parsing_all_feats_LogisticRegression.model"
        self.model = Learner.from_file(join(model_path, model_name))
Exemple #6
0
def test_backward_compatibility():
    '''
    Verify that a model from v0.9.17 can still be loaded and generate the same predictions.
    '''
    predict_path = os.path.join(_my_dir, 'backward_compatibility',
                                'v0.9.17_test_summary_test_summary_LogisticRegression.predictions')
    model_path = os.path.join(_my_dir, 'backward_compatibility',
                              'v0.9.17_test_summary_test_summary_LogisticRegression.{}.model'.format(sys.version_info[0]))
    test_path = os.path.join(_my_dir, 'backward_compatibility', 'v0.9.17_test_summary.jsonlines')

    learner = Learner.from_file(model_path)
    examples = load_examples(test_path, quiet=True)
    new_predictions = learner.predict(examples)[:, 1]

    with open(predict_path) as predict_file:
        for line, new_val in zip(predict_file, new_predictions):
            assert_almost_equal(float(line.strip()), new_val)
Exemple #7
0
def update_model(model_file):
    """Read in the model file and save it again."""
    model_dir = dirname(model_file)

    # get the list of current files so that we can
    # remove them later to ensure there are no stranded
    # .npy files
    npy_files = glob.glob(join(model_dir, '*.npy'))

    # now load the SKLL model
    model = Learner.from_file(model_file)

    # delete the existing npy files. The model file will get overwritten,
    # but we do not know the exact number of current .npy files.
    for npy_file in npy_files:
        remove(npy_file)

    model.save(model_file)
Exemple #8
0
    def __init__(self,
                 model_path,
                 threshold=None,
                 positive_label_index=1,
                 all_labels=False,
                 logger=None):
        """
        Initialize the predictor.

        Parameters
        ----------
        model_path : str
            Path to use when loading trained model.
        threshold : float, optional
            If the model we're using is generating probabilities
            of the positive label, return 1 if it meets/exceeds
            the given threshold and 0 otherwise.
            Defaults to ``None``.
        positive_label_index : int, optional
            If the model is only being used to predict the
            probability of a particular class, this
            specifies the index of the class we're
            predicting. 1 = second class, which is default
            for binary classification.
            Defaults to 1.
        all_labels: bool, optional
            A flag indicating whether to return the probabilities for all
            labels in each row instead of just returning the probability of
            `positive_label`. Defaults to None.
        logger : logging object, optional
            A logging object. If ``None`` is passed, get logger from ``__name__``.
            Defaults to ``None``.
        """
        # self.logger = logger if logger else logging.getLogger(__name__)
        if threshold is not None and all_labels:
            raise ValueError("`threshold` and `all_labels` are mutually "
                             "exclusive. They can not both be set to True.")

        self._learner = Learner.from_file(model_path)
        self._pos_index = positive_label_index
        self.threshold = threshold
        self.all_labels = all_labels
        self.output_file_header = None
    def __init__(self, model_path, threshold=None, positive_label=1):
        """
        Initialize the predictor.

        :param model_path: Path to use when loading trained model.
        :type model_path: str
        :param threshold: If the model we're using is generating probabilities
                          of the positive label, return 1 if it meets/exceeds
                          the given threshold and 0 otherwise.
        :type threshold: float
        :param positive_label: If the model is only being used to predict the
                               probability of a particular class, this
                               specifies the index of the class we're
                               predicting. 1 = second class, which is default
                               for binary classification.
        :type positive_label: int
        """
        self._learner = Learner.from_file(model_path)
        self._pos_index = positive_label
        self.threshold = threshold
def minimize_model(model_path, model_name):
    '''
    This function minimizes the model by removing information about features
    that get weights of 0.
    '''

    model = Learner.from_file(os.path.join(model_path, model_name))
    # Take out coefficients for features that are 0 for all classes.
    nonzero_feat_mask = ~np.all(model.model.coef_ == 0, axis=0)
    model.model.coef_ = model.model.coef_[:, nonzero_feat_mask]
    # Remove the extra words from the feat vectorizer.
    model.feat_vectorizer.restrict(nonzero_feat_mask)
    # Refit the feature selector to expect the correct size matrices.
    model.feat_selector.fit(np.ones((1, model.model.coef_.shape[1])))
    # Make the feature vectorizer return dense matrices (this is a bit faster).
    model.feat_vectorizer.set_params(sparse=False)
    # Delete the raw_coef_ attribute that sklearn *only* uses when training.
    model.model.raw_coef_ = None
    # Save the minimized model.
    model.save(os.path.join(model_path, model_name))
Exemple #11
0
    def __init__(self, model_path, threshold=None, positive_label=1):
        """
        Initialize the predictor.

        :param model_path: Path to use when loading trained model.
        :type model_path: str
        :param threshold: If the model we're using is generating probabilities
                          of the positive label, return 1 if it meets/exceeds
                          the given threshold and 0 otherwise.
        :type threshold: float
        :param positive_label: If the model is only being used to predict the
                               probability of a particular class, this
                               specifies the index of the class we're
                               predicting. 1 = second class, which is default
                               for binary classification.
        :type positive_label: int
        """
        self._learner = Learner.from_file(model_path)
        self._pos_index = positive_label
        self.threshold = threshold
Exemple #12
0
def test_backward_compatibility():
    """
    Test to validate backward compatibility
    """
    predict_path = join(_my_dir, 'backward_compatibility',
                        ('v0.9.17_test_summary_test_summary_'
                         'LogisticRegression.predictions'))
    model_path = join(_my_dir, 'backward_compatibility',
                      ('v0.9.17_test_summary_test_summary_LogisticRegression.'
                       '{}.model').format(sys.version_info[0]))
    test_path = join(_my_dir, 'backward_compatibility',
                     'v0.9.17_test_summary.jsonlines')

    learner = Learner.from_file(model_path)
    examples = Reader.for_path(test_path, quiet=True).read()
    new_predictions = learner.predict(examples)[:, 1]

    with open(predict_path) as predict_file:
        old_predictions = [float(line.strip()) for line in predict_file]
    assert_almost_equal(new_predictions, old_predictions)
def minimize_model(model_path, model_name):
    '''
    This function minimizes the model by removing information about features
    that get weights of 0.
    '''

    model = Learner.from_file(os.path.join(model_path, model_name))
    # Take out coefficients for features that are 0 for all classes.
    nonzero_feat_mask = ~np.all(model.model.coef_ == 0, axis=0)
    model.model.coef_ = model.model.coef_[:, nonzero_feat_mask]
    # Remove the extra words from the feat vectorizer.
    model.feat_vectorizer.restrict(nonzero_feat_mask)
    # Refit the feature selector to expect the correct size matrices.
    model.feat_selector.fit(np.ones((1, model.model.coef_.shape[1])))
    # Make the feature vectorizer return dense matrices (this is a bit faster).
    model.feat_vectorizer.set_params(sparse=False)
    # Delete the raw_coef_ attribute that sklearn *only* uses when training.
    model.model.raw_coef_ = None
    # Save the minimized model.
    model.save(os.path.join(model_path, model_name))
Exemple #14
0
def test_backward_compatibility():
    """
    Test to validate backward compatibility
    """
    predict_path = join(_my_dir, 'backward_compatibility',
                        ('v0.9.17_test_summary_test_summary_'
                         'LogisticRegression.predictions'))
    model_path = join(_my_dir, 'backward_compatibility',
                      ('v0.9.17_test_summary_test_summary_LogisticRegression.'
                       '{}.model').format(sys.version_info[0]))
    test_path = join(_my_dir, 'backward_compatibility',
                     'v0.9.17_test_summary.jsonlines')

    learner = Learner.from_file(model_path)
    examples = Reader.for_path(test_path, quiet=True).read()
    new_predictions = learner.predict(examples)[:, 1]

    with open(predict_path) as predict_file:
        old_predictions = [float(line.strip()) for
                           line in predict_file]
    assert_almost_equal(new_predictions, old_predictions)
    def __init__(self, model_path, threshold=None, positive_label=1,
                 all_labels=False, logger=None):
        """
        Initialize the predictor.

        Parameters
        ----------
        model_path : str
            Path to use when loading trained model.
        threshold : float, optional
            If the model we're using is generating probabilities
            of the positive label, return 1 if it meets/exceeds
            the given threshold and 0 otherwise.
            Defaults to ``None``.
        positive_label : int, optional
            If the model is only being used to predict the
            probability of a particular class, this
            specifies the index of the class we're
            predicting. 1 = second class, which is default
            for binary classification.
            Defaults to 1.
        all_labels: bool, optional
            A flag indicating whether to return the probabilities for all
            labels in each row instead of just returning the probability of
            `positive_label`. Defaults to None.
        logger : logging object, optional
            A logging object. If ``None`` is passed, get logger from ``__name__``.
            Defaults to ``None``.
        """
        # self.logger = logger if logger else logging.getLogger(__name__)
        if threshold is not None and all_labels:
            raise ValueError("`threshold` and `all_labels` are mutually "
                             "exclusive. They can not both be set to True.")

        self._learner = Learner.from_file(model_path)
        self._pos_index = positive_label
        self.threshold = threshold
        self.all_labels = all_labels
        self.output_file_header = None
Exemple #16
0
def _classify_featureset(args):
    """
    Classification job to be submitted to grid.

    Parameters
    ----------
    args : dict
        A dictionary with arguments for classifying the
        ``FeatureSet`` instance.

    Returns
    -------
    res : list of dicts
        The results of the classification, in the format
        of a list of dictionaries.

    Raises
    ------
    ValueError
        If extra unknown arguments are passed to the function.
    """

    # Extract all the arguments.
    # (There doesn't seem to be a better way to do this since one can't specify
    # required keyword arguments.)

    experiment_name = args.pop("experiment_name")
    task = args.pop("task")
    sampler = args.pop("sampler")
    feature_hasher = args.pop("feature_hasher")
    hasher_features = args.pop("hasher_features")
    job_name = args.pop("job_name")
    featureset = args.pop("featureset")
    featureset_name = args.pop("featureset_name")
    learner_name = args.pop("learner_name")
    train_path = args.pop("train_path")
    test_path = args.pop("test_path")
    train_set_name = args.pop("train_set_name")
    test_set_name = args.pop("test_set_name")
    shuffle = args.pop('shuffle')
    model_path = args.pop("model_path")
    prediction_prefix = args.pop("prediction_prefix")
    grid_search = args.pop("grid_search")
    grid_objective = args.pop("grid_objective")
    output_metrics = args.pop("output_metrics")
    suffix = args.pop("suffix")
    job_log_file = args.pop("log_file")
    job_log_level = args.pop("log_level")
    probability = args.pop("probability")
    pipeline = args.pop("pipeline")
    results_path = args.pop("results_path")
    fixed_parameters = args.pop("fixed_parameters")
    sampler_parameters = args.pop("sampler_parameters")
    param_grid = args.pop("param_grid")
    pos_label_str = args.pop("pos_label_str")
    overwrite = args.pop("overwrite")
    feature_scaling = args.pop("feature_scaling")
    min_feature_count = args.pop("min_feature_count")
    folds_file = args.pop("folds_file")
    grid_search_jobs = args.pop("grid_search_jobs")
    grid_search_folds = args.pop("grid_search_folds")
    cv_folds = args.pop("cv_folds")
    save_cv_folds = args.pop("save_cv_folds")
    save_cv_models = args.pop("save_cv_models")
    use_folds_file_for_grid_search = args.pop("use_folds_file_for_grid_search")
    stratified_folds = args.pop("do_stratified_folds")
    label_col = args.pop("label_col")
    id_col = args.pop("id_col")
    ids_to_floats = args.pop("ids_to_floats")
    class_map = args.pop("class_map")
    custom_learner_path = args.pop("custom_learner_path")
    custom_metric_path = args.pop("custom_metric_path")
    quiet = args.pop('quiet', False)
    learning_curve_cv_folds = args.pop("learning_curve_cv_folds")
    learning_curve_train_sizes = args.pop("learning_curve_train_sizes")

    if args:
        raise ValueError(("Extra arguments passed to _classify_featureset: "
                          "{}").format(args.keys()))
    start_timestamp = datetime.datetime.now()

    # create a new SKLL logger for this specific job and
    # use the given log level
    logger = get_skll_logger(job_name, job_log_file, log_level=job_log_level)

    try:

        # log messages
        logger.info("Task: {}".format(task))

        # check if we have any possible custom metrics
        possible_custom_metric_names = []
        for metric_name in output_metrics + [grid_objective]:
            # metrics that are not in `SCORERS` or `None` are candidates
            # (the `None` is a by-product of how jobs with single tuning
            # objectives are created)
            if metric_name not in SCORERS and metric_name is not None:
                possible_custom_metric_names.append(metric_name)
            # if the metric is already in `SCORERS`, is it a custom one
            # that we already registered? if so, log that
            elif metric_name in _CUSTOM_METRICS:
                logger.info(
                    f"custom metric '{metric_name}' is already registered")

        # initialize list that will hold any invalid metrics
        # that we could not register as custom metrics
        invalid_metric_names = []

        # if we have possible custom metrics
        if possible_custom_metric_names:

            # check that we have a file to load them from
            if not custom_metric_path:
                raise ValueError(
                    f"invalid metrics specified: {possible_custom_metric_names}"
                )
            else:
                # try to register each possible custom metric
                # raise an exception if we fail, if we don't then
                # add the custom metric function to `globals()` so
                # that it serializes properly for gridmap
                for custom_metric_name in possible_custom_metric_names:
                    try:
                        custom_metric_func = register_custom_metric(
                            custom_metric_path, custom_metric_name)
                    except (AttributeError, NameError, ValueError):
                        invalid_metric_names.append(custom_metric_name)
                    else:
                        logger.info(f"registered '{custom_metric_name}' as a "
                                    f"custom metric")
                        globals()[custom_metric_name] = custom_metric_func

        # raise an error if we have any invalid metrics
        if invalid_metric_names:
            raise ValueError(
                f"invalid metrics specified: {invalid_metric_names}. "
                f"If these are custom metrics, check the function "
                f"names.")

        if task == 'cross_validate':
            if isinstance(cv_folds, int):
                num_folds = cv_folds
            else:  # folds_file was used, so count the unique fold ids.
                num_folds = len(set(cv_folds.values()))
            logger.info("Cross-validating ({} folds) on {}, feature "
                        "set {} ...".format(num_folds, train_set_name,
                                            featureset))
        elif task == 'evaluate':
            logger.info("Training on {}, Test on {}, "
                        "feature set {} ...".format(train_set_name,
                                                    test_set_name, featureset))
        elif task == 'train':
            logger.info("Training on {}, feature set {} ...".format(
                train_set_name, featureset))
        elif task == 'learning_curve':
            logger.info("Generating learning curve "
                        "({} 80/20 folds, sizes={}, objective={}) on {}, "
                        "feature set {} ...".format(
                            learning_curve_cv_folds,
                            learning_curve_train_sizes, grid_objective,
                            train_set_name, featureset))
        else:  # predict
            logger.info("Training on {}, Making predictions on {}, "
                        "feature set {} ...".format(train_set_name,
                                                    test_set_name, featureset))

        # check whether a trained model on the same data with the same
        # featureset already exists if so, load it and then use it on test data
        modelfile = join(model_path, '{}.model'.format(job_name))
        if (task in ['cross_validate', 'learning_curve']
                or not exists(modelfile) or overwrite):
            train_examples = load_featureset(train_path,
                                             featureset,
                                             suffix,
                                             label_col=label_col,
                                             id_col=id_col,
                                             ids_to_floats=ids_to_floats,
                                             quiet=quiet,
                                             class_map=class_map,
                                             feature_hasher=feature_hasher,
                                             num_features=hasher_features,
                                             logger=logger)

            train_set_size = len(train_examples.ids)
            if not train_examples.has_labels:
                raise ValueError('Training examples do not have labels')
            # initialize a classifer object
            learner = Learner(learner_name,
                              probability=probability,
                              pipeline=pipeline,
                              feature_scaling=feature_scaling,
                              model_kwargs=fixed_parameters,
                              pos_label_str=pos_label_str,
                              min_feature_count=min_feature_count,
                              sampler=sampler,
                              sampler_kwargs=sampler_parameters,
                              custom_learner_path=custom_learner_path,
                              logger=logger)

        # load the model if it already exists
        else:
            # import custom learner into global namespace if we are reusing
            # a saved model
            if custom_learner_path:
                globals()[learner_name] = load_custom_learner(
                    custom_learner_path, learner_name)
            train_set_size = 'unknown'
            if exists(modelfile) and not overwrite:
                logger.info("Loading pre-existing {} model: {}".format(
                    learner_name, modelfile))
            learner = Learner.from_file(modelfile)

            # attach the job logger to this learner
            learner.logger = logger

        # Load test set if there is one
        if task == 'evaluate' or task == 'predict':
            test_examples = load_featureset(test_path,
                                            featureset,
                                            suffix,
                                            label_col=label_col,
                                            id_col=id_col,
                                            ids_to_floats=ids_to_floats,
                                            quiet=quiet,
                                            class_map=class_map,
                                            feature_hasher=feature_hasher,
                                            num_features=hasher_features)
            test_set_size = len(test_examples.ids)
        else:
            test_set_size = 'n/a'

        # compute information about xval and grid folds that can be put in results
        # in readable form
        if isinstance(cv_folds, dict):
            cv_folds_to_print = '{} via folds file'.format(
                len(set(cv_folds.values())))
        else:
            cv_folds_to_print = str(cv_folds)

        if isinstance(grid_search_folds, dict):
            grid_search_folds_to_print = \
                '{} via folds file'.format(len(set(grid_search_folds.values())))
        else:
            grid_search_folds_to_print = str(grid_search_folds)

        # create a list of dictionaries of the results information
        learner_result_dict_base = {
            'experiment_name':
            experiment_name,
            'train_set_name':
            train_set_name,
            'train_set_size':
            train_set_size,
            'test_set_name':
            test_set_name,
            'test_set_size':
            test_set_size,
            'featureset':
            json.dumps(featureset),
            'featureset_name':
            featureset_name,
            'shuffle':
            shuffle,
            'learner_name':
            learner_name,
            'task':
            task,
            'start_timestamp':
            start_timestamp.strftime('%d %b %Y %H:%M:'
                                     '%S.%f'),
            'version':
            __version__,
            'feature_scaling':
            feature_scaling,
            'folds_file':
            folds_file,
            'grid_search':
            grid_search,
            'grid_objective':
            grid_objective,
            'grid_search_folds':
            grid_search_folds_to_print,
            'min_feature_count':
            min_feature_count,
            'cv_folds':
            cv_folds_to_print,
            'using_folds_file':
            isinstance(cv_folds, dict) or isinstance(grid_search_folds, dict),
            'save_cv_folds':
            save_cv_folds,
            'save_cv_models':
            save_cv_models,
            'use_folds_file_for_grid_search':
            use_folds_file_for_grid_search,
            'stratified_folds':
            stratified_folds,
            'scikit_learn_version':
            SCIKIT_VERSION
        }

        # check if we're doing cross-validation, because we only load/save
        # models when we're not.
        task_results = None
        if task == 'cross_validate':
            logger.info('Cross-validating')
            (
                task_results, grid_scores, grid_search_cv_results_dicts,
                skll_fold_ids, models
            ) = learner.cross_validate(
                train_examples,
                shuffle=shuffle,
                stratified=stratified_folds,
                prediction_prefix=prediction_prefix,
                grid_search=grid_search,
                grid_search_folds=grid_search_folds,
                cv_folds=cv_folds,
                grid_objective=grid_objective,
                output_metrics=output_metrics,
                param_grid=param_grid,
                grid_jobs=grid_search_jobs,
                save_cv_folds=save_cv_folds,
                save_cv_models=save_cv_models,
                use_custom_folds_for_grid_search=use_folds_file_for_grid_search
            )
            if models:
                for index, m in enumerate(models, start=1):
                    modelfile = join(model_path,
                                     '{}_fold{}.model'.format(job_name, index))
                    m.save(modelfile)
        elif task == 'learning_curve':
            logger.info("Generating learning curve(s)")
            (curve_train_scores, curve_test_scores,
             computed_curve_train_sizes) = learner.learning_curve(
                 train_examples,
                 grid_objective,
                 cv_folds=learning_curve_cv_folds,
                 train_sizes=learning_curve_train_sizes)
        else:
            # if we have do not have a saved model, we need to train one.
            if not exists(modelfile) or overwrite:
                logger.info("Featurizing and training new {} model".format(
                    learner_name))

                (best_score, grid_search_cv_results) = learner.train(
                    train_examples,
                    shuffle=shuffle,
                    grid_search=grid_search,
                    grid_search_folds=grid_search_folds,
                    grid_objective=grid_objective,
                    param_grid=param_grid,
                    grid_jobs=grid_search_jobs)
                grid_scores = [best_score]
                grid_search_cv_results_dicts = [grid_search_cv_results]

                # save model
                if model_path:
                    learner.save(modelfile)

                if grid_search:
                    logger.info("Best {} grid search score: {}".format(
                        grid_objective, round(best_score, 3)))
            else:
                grid_scores = [None]
                grid_search_cv_results_dicts = [None]

            # print out the parameters
            param_out = ('{}: {}'.format(param_name, param_value)
                         for param_name, param_value in
                         learner.model.get_params().items())
            logger.info("Hyperparameters: {}".format(', '.join(param_out)))

            # run on test set or cross-validate on training data,
            # depending on what was asked for
            if task == 'evaluate':
                logger.info("Evaluating predictions")
                task_results = [
                    learner.evaluate(test_examples,
                                     prediction_prefix=prediction_prefix,
                                     grid_objective=grid_objective,
                                     output_metrics=output_metrics)
                ]
            elif task == 'predict':
                logger.info("Writing predictions")
                # we set `class_labels` to `False` so that if the learner is
                # probabilistic, probabilities are written instead of labels
                learner.predict(test_examples,
                                prediction_prefix=prediction_prefix,
                                class_labels=False)
            # do nothing here for train

        end_timestamp = datetime.datetime.now()
        learner_result_dict_base['end_timestamp'] = end_timestamp.strftime(
            '%d %b %Y %H:%M:%S.%f')
        total_time = end_timestamp - start_timestamp
        learner_result_dict_base['total_time'] = str(total_time)

        if task == 'cross_validate' or task == 'evaluate':
            results_json_path = join(results_path,
                                     '{}.results.json'.format(job_name))

            res = _create_learner_result_dicts(task_results, grid_scores,
                                               grid_search_cv_results_dicts,
                                               learner_result_dict_base)

            # write out the result dictionary to a json file
            with open(results_json_path, 'w') as json_file:
                json.dump(res, json_file, cls=NumpyTypeEncoder)

            with open(join(results_path, '{}.results'.format(job_name)),
                      'w') as output_file:
                _print_fancy_output(res, output_file)

        elif task == 'learning_curve':
            results_json_path = join(results_path,
                                     '{}.results.json'.format(job_name))

            res = {}
            res.update(learner_result_dict_base)
            res.update({
                'learning_curve_cv_folds':
                learning_curve_cv_folds,
                'given_curve_train_sizes':
                learning_curve_train_sizes,
                'learning_curve_train_scores_means':
                np.mean(curve_train_scores, axis=1),
                'learning_curve_test_scores_means':
                np.mean(curve_test_scores, axis=1),
                'learning_curve_train_scores_stds':
                np.std(curve_train_scores, axis=1, ddof=1),
                'learning_curve_test_scores_stds':
                np.std(curve_test_scores, axis=1, ddof=1),
                'computed_curve_train_sizes':
                computed_curve_train_sizes
            })

            # we need to return and write out a list of dictionaries
            res = [res]

            # write out the result dictionary to a json file
            with open(results_json_path, 'w') as json_file:
                json.dump(res, json_file, cls=NumpyTypeEncoder)

        # For all other tasks, i.e. train or predict
        else:
            if results_path:
                results_json_path = join(results_path,
                                         '{}.results.json'.format(job_name))

                assert len(grid_scores) == 1
                assert len(grid_search_cv_results_dicts) == 1
                grid_search_cv_results_dict = {"grid_score": grid_scores[0]}
                grid_search_cv_results_dict["grid_search_cv_results"] = \
                    grid_search_cv_results_dicts[0]
                grid_search_cv_results_dict.update(learner_result_dict_base)
                # write out the result dictionary to a json file
                with open(results_json_path, 'w') as json_file:
                    json.dump(grid_search_cv_results_dict,
                              json_file,
                              cls=NumpyTypeEncoder)
            res = [learner_result_dict_base]

        # write out the cv folds if required
        if task == 'cross_validate' and save_cv_folds:
            skll_fold_ids_file = experiment_name + '_skll_fold_ids.csv'
            with open(join(results_path, skll_fold_ids_file),
                      'w') as output_file:
                _write_skll_folds(skll_fold_ids, output_file)

    finally:
        close_and_remove_logger_handlers(logger)

    return res
Exemple #17
0
def _classify_featureset(args):
    """ Classification job to be submitted to grid """
    # Extract all the arguments.
    # (There doesn't seem to be a better way to do this since one can't specify
    # required keyword arguments.)
    experiment_name = args.pop("experiment_name")
    task = args.pop("task")
    sampler = args.pop("sampler")
    feature_hasher = args.pop("feature_hasher")
    hasher_features = args.pop("hasher_features")
    job_name = args.pop("job_name")
    featureset = args.pop("featureset")
    featureset_name = args.pop("featureset_name")
    learner_name = args.pop("learner_name")
    train_path = args.pop("train_path")
    test_path = args.pop("test_path")
    train_set_name = args.pop("train_set_name")
    test_set_name = args.pop("test_set_name")
    shuffle = args.pop('shuffle')
    model_path = args.pop("model_path")
    prediction_prefix = args.pop("prediction_prefix")
    grid_search = args.pop("grid_search")
    grid_objective = args.pop("grid_objective")
    suffix = args.pop("suffix")
    log_path = args.pop("log_path")
    probability = args.pop("probability")
    results_path = args.pop("results_path")
    fixed_parameters = args.pop("fixed_parameters")
    sampler_parameters = args.pop("sampler_parameters")
    param_grid = args.pop("param_grid")
    pos_label_str = args.pop("pos_label_str")
    overwrite = args.pop("overwrite")
    feature_scaling = args.pop("feature_scaling")
    min_feature_count = args.pop("min_feature_count")
    grid_search_jobs = args.pop("grid_search_jobs")
    grid_search_folds = args.pop("grid_search_folds")
    cv_folds = args.pop("cv_folds")
    stratified_folds = args.pop("do_stratified_folds")
    label_col = args.pop("label_col")
    id_col = args.pop("id_col")
    ids_to_floats = args.pop("ids_to_floats")
    class_map = args.pop("class_map")
    custom_learner_path = args.pop("custom_learner_path")
    quiet = args.pop('quiet', False)

    if args:
        raise ValueError(("Extra arguments passed to _classify_featureset: "
                          "{}").format(args.keys()))
    start_timestamp = datetime.datetime.now()

    with open(log_path, 'w') as log_file:
        # logging
        print("Task:", task, file=log_file)
        if task == 'cross_validate':
            print(("Cross-validating ({} folds) on {}, feature " +
                   "set {} ...").format(cv_folds, train_set_name, featureset),
                  file=log_file)
        elif task == 'evaluate':
            print(("Training on {}, Test on {}, " +
                   "feature set {} ...").format(train_set_name, test_set_name,
                                                featureset),
                  file=log_file)
        elif task == 'train':
            print("Training on {}, feature set {} ...".format(train_set_name,
                                                              featureset),
                  file=log_file)
        else:  # predict
            print(("Training on {}, Making predictions about {}, " +
                   "feature set {} ...").format(train_set_name, test_set_name,
                                                featureset),
                  file=log_file)

        # check whether a trained model on the same data with the same
        # featureset already exists if so, load it and then use it on test data
        modelfile = join(model_path, '{}.model'.format(job_name))
        if task == 'cross_validate' or (not exists(modelfile) or
                                        overwrite):
            train_examples = _load_featureset(train_path, featureset, suffix,
                                              label_col=label_col,
                                              id_col=id_col,
                                              ids_to_floats=ids_to_floats,
                                              quiet=quiet, class_map=class_map,
                                              feature_hasher=feature_hasher,
                                              num_features=hasher_features)

            train_set_size = len(train_examples.ids)
            if not train_examples.has_labels:
                raise ValueError('Training examples do not have labels')
            # initialize a classifer object
            learner = Learner(learner_name,
                              probability=probability,
                              feature_scaling=feature_scaling,
                              model_kwargs=fixed_parameters,
                              pos_label_str=pos_label_str,
                              min_feature_count=min_feature_count,
                              sampler=sampler,
                              sampler_kwargs=sampler_parameters,
                              custom_learner_path=custom_learner_path)
        # load the model if it already exists
        else:
            # import the custom learner path here in case we are reusing a
            # saved model
            if custom_learner_path:
                _import_custom_learner(custom_learner_path, learner_name)
            train_set_size = 'unknown'
            if exists(modelfile) and not overwrite:
                print(('\tloading pre-existing %s model: %s') % (learner_name,
                                                                 modelfile))
            learner = Learner.from_file(modelfile)

        # Load test set if there is one
        if task == 'evaluate' or task == 'predict':
            test_examples = _load_featureset(test_path, featureset, suffix,
                                             label_col=label_col,
                                             id_col=id_col,
                                             ids_to_floats=ids_to_floats,
                                             quiet=quiet, class_map=class_map,
                                             feature_hasher=feature_hasher,
                                             num_features=hasher_features)
            test_set_size = len(test_examples.ids)
        else:
            test_set_size = 'n/a'

        # create a list of dictionaries of the results information
        learner_result_dict_base = {'experiment_name': experiment_name,
                                    'train_set_name': train_set_name,
                                    'train_set_size': train_set_size,
                                    'test_set_name': test_set_name,
                                    'test_set_size': test_set_size,
                                    'featureset': json.dumps(featureset),
                                    'featureset_name': featureset_name,
                                    'shuffle': shuffle,
                                    'learner_name': learner_name,
                                    'task': task,
                                    'start_timestamp':
                                    start_timestamp.strftime('%d %b %Y %H:%M:'
                                                             '%S.%f'),
                                    'version': __version__,
                                    'feature_scaling': feature_scaling,
                                    'grid_search': grid_search,
                                    'grid_objective': grid_objective,
                                    'grid_search_folds': grid_search_folds,
                                    'min_feature_count': min_feature_count,
                                    'cv_folds': cv_folds,
                                    'stratified_folds': stratified_folds,
                                    'scikit_learn_version': SCIKIT_VERSION}

        # check if we're doing cross-validation, because we only load/save
        # models when we're not.
        task_results = None
        if task == 'cross_validate':
            print('\tcross-validating', file=log_file)
            task_results, grid_scores = learner.cross_validate(
                train_examples, shuffle=shuffle, stratified=stratified_folds,
                prediction_prefix=prediction_prefix, grid_search=grid_search,
                grid_search_folds=grid_search_folds, cv_folds=cv_folds,
                grid_objective=grid_objective, param_grid=param_grid,
                grid_jobs=grid_search_jobs)
        else:
            # if we have do not have a saved model, we need to train one.
            if not exists(modelfile) or overwrite:
                print(('\tfeaturizing and training new ' +
                       '{} model').format(learner_name),
                      file=log_file)

                if not isinstance(cv_folds, int):
                    grid_search_folds = cv_folds

                best_score = learner.train(train_examples,
                                           shuffle=shuffle,
                                           grid_search=grid_search,
                                           grid_search_folds=grid_search_folds,
                                           grid_objective=grid_objective,
                                           param_grid=param_grid,
                                           grid_jobs=grid_search_jobs)
                grid_scores = [best_score]

                # save model
                if model_path:
                    learner.save(modelfile)

                if grid_search:
                    # note: bankers' rounding is used in python 3,
                    # so these scores may be different between runs in
                    # python 2 and 3 at the final decimal place.
                    print('\tbest {} grid search score: {}'
                          .format(grid_objective, round(best_score, 3)),
                          file=log_file)
            else:
                grid_scores = [None]

            # print out the tuned parameters and best CV score
            param_out = ('{}: {}'.format(param_name, param_value)
                         for param_name, param_value in
                         iteritems(learner.model.get_params()))
            print('\thyperparameters: {}'.format(', '.join(param_out)),
                  file=log_file)

            # run on test set or cross-validate on training data,
            # depending on what was asked for

            if task == 'evaluate':
                print('\tevaluating predictions', file=log_file)
                task_results = [learner.evaluate(
                    test_examples, prediction_prefix=prediction_prefix,
                    grid_objective=grid_objective)]
            elif task == 'predict':
                print('\twriting predictions', file=log_file)
                learner.predict(test_examples,
                                prediction_prefix=prediction_prefix)
            # do nothing here for train

        end_timestamp = datetime.datetime.now()
        learner_result_dict_base['end_timestamp'] = end_timestamp.strftime(
            '%d %b %Y %H:%M:%S.%f')
        total_time = end_timestamp - start_timestamp
        learner_result_dict_base['total_time'] = str(total_time)

        if task == 'cross_validate' or task == 'evaluate':
            results_json_path = join(results_path,
                                     '{}.results.json'.format(job_name))

            res = _create_learner_result_dicts(task_results, grid_scores,
                                               learner_result_dict_base)

            # write out the result dictionary to a json file
            file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
            with open(results_json_path, file_mode) as json_file:
                json.dump(res, json_file, cls=NumpyTypeEncoder)

            with open(join(results_path,
                           '{}.results'.format(job_name)),
                      'w') as output_file:
                _print_fancy_output(res, output_file)
        else:
            res = [learner_result_dict_base]

    return res
Exemple #18
0
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    Parameters
    ----------
    argv : list of str
        List of arguments, as if specified on the command-line.
        If None, ``sys.argv[1:]`` is used instead.
    """

    # Get command line arguments
    parser = argparse.ArgumentParser(
        description="Loads a trained model and outputs predictions based \
                     on input feature files.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        conflict_handler='resolve')
    parser.add_argument('model_file',
                        help='Model file to load and use for generating '
                        'predictions.')
    parser.add_argument('input_files',
                        help='A space-separated list of CSV, TSV, or '
                        'jsonlines files (with or without the label '
                        'column), with the appropriate suffix.',
                        nargs='+')
    parser.add_argument('-i',
                        '--id_col',
                        help='Name of the column which contains the instance '
                        'IDs in ARFF, CSV, or TSV files.',
                        default='id')
    parser.add_argument('-l',
                        '--label_col',
                        help='Name of the column which contains the labels '
                        'in ARFF, CSV, or TSV files. For ARFF files, '
                        'this must be the final column to count as the '
                        'label.',
                        default='y')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-p',
                       '--predict_labels',
                       help="If the model is doing probabilistic "
                       "classification, output the class label "
                       "with the highest probability instead of "
                       "the class probabilities.",
                       action='store_true',
                       default=False)
    group.add_argument('-t',
                       '--threshold',
                       help="If the model we're using is "
                       "doing probabilistic binary "
                       "classification, output the positive "
                       "class label if its probability"
                       "meets/exceeds this threshold"
                       "and output the negative class "
                       "label otherwise.",
                       type=float)
    parser.add_argument('-q',
                        '--quiet',
                        help='Suppress printing of "Loading..." messages.',
                        action='store_true')
    parser.add_argument('-o',
                        '--output_file',
                        help="Path to output tsv file. If not specified, "
                        "predictions will be printed to stdout. For "
                        "probabilistic binary classification, the "
                        "probability of the positive class will "
                        "always be in the last column.")
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s {0}'.format(__version__))

    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'))
    logger = logging.getLogger(__name__)

    # load the model from disk
    learner = Learner.from_file(args.model_file)

    # is the model a regressor or a classifier?
    estimator_type = learner.model_type._estimator_type

    # if we are using a binary classification model, get the positive
    # class label string if from the `pos_label_str` attribute, and if
    # that is `None`, get it from the learner's label dictionary; also
    # get the string denoting the negative label which is just the other
    # label in the list
    if estimator_type == 'classifier':
        if len(learner.label_list) == 2:
            if learner.pos_label_str is not None:
                pos_label_str = learner.pos_label_str
            else:
                pos_label_str = [
                    label for label in learner.label_dict
                    if learner.label_dict[label] == 1
                ][0]
            neg_label_str = [
                label for label in learner.label_list if label != pos_label_str
            ][0]
            logger.info("{} is the label for the positive class.")

    # if we want to choose labels by thresholding the probabilities,
    # make sure that the learner is probabilistic AND binary first
    is_probabilistic_classifier = hasattr(
        learner._model, 'predict_proba') and learner.probability
    if (args.threshold is not None and
        (not is_probabilistic_classifier or len(learner.label_list) != 2)):
        error_msg = ('Cannot threshold probabilities to predict '
                     'positive class since given {} learner is '
                     'either multi-class, non-probabilistic, or '
                     'was not trained with probability=True'
                     '.'.format(learner._model_type.__name__))
        logger.error(error_msg)
        raise ValueError(error_msg)

    # if we want to choose labels by predicting the most likely label,
    # make sure that the learner is probabilistic
    if args.predict_labels and not is_probabilistic_classifier:
        error_msg = ('Cannot predict most likely labels from probabilities '
                     'since given {} learner is either non-probabilistic or '
                     'was not trained with probability=True'
                     '.'.format(learner._model_type.__name__))
        logger.error(error_msg)
        raise ValueError(error_msg)

    # iterate over all the specified input files
    for i, input_file in enumerate(args.input_files):

        # make sure each file extension is one we can process
        input_extension = os.path.splitext(input_file)[1].lower()
        if input_extension not in EXT_TO_READER:
            logger.error(('Input file must be in either .arff, .csv, '
                          '.jsonlines, .libsvm, .megam, .ndj, or .tsv format. '
                          ' Skipping file {}').format(input_file))
            continue
        else:
            # read in the file into a featureset
            reader = EXT_TO_READER[input_extension](input_file,
                                                    quiet=args.quiet,
                                                    label_col=args.label_col,
                                                    id_col=args.id_col)
            feature_set = reader.read()

            # for this featureset, get the predictions of either the
            # most likely class labels or the class label probabilities;
            # if the model is a regressor then `class_labels` will be
            # ignored entirely
            original_predictions = learner.predict(
                feature_set,
                class_labels=not learner.probability or args.predict_labels)

            # get the appropriate header depending on the what we will
            # be outputting; if we are using a regressor or a non-probabilistic
            # learner, or thresholding probabilities, or predicting most likely
            # labels, we are outputting only two columns - the ID and the label,
            # otherwise we are outputting N + 1 columns where N = number of classes
            if (estimator_type == 'regressor' or not learner.probability
                    or args.predict_labels or args.threshold is not None):
                header = ["id", "prediction"]
            else:
                header = ["id"] + [str(x) for x in learner.label_list]

            # now let us start computing what we want to output based
            # on the predictions we have so far

            # Threshold the positive class label probability
            if args.threshold is not None:
                predictions = []
                for neg_label_prob, pos_label_prob in original_predictions:
                    chosen_label = pos_label_str if pos_label_prob >= args.threshold else neg_label_str
                    predictions.append(chosen_label)

            # For everything else, we can just use the original predictions
            else:
                predictions = original_predictions

            # now initialize the output file
            outputfh = open(args.output_file,
                            'a') if args.output_file else sys.stdout

            # write out the header first but only once
            if i == 0:
                print("\t".join(header), file=outputfh)

            # and now write out the predictions
            for j, prediction in enumerate(predictions):
                id_ = feature_set.ids[j]
                prediction_str = "\t".join(
                    [str(p) for p in prediction]) if isinstance(
                        prediction, (np.ndarray, list)) else prediction
                print("{}\t{}".format(id_, prediction_str), file=outputfh)

            # close the file if we had opened one
            if args.output_file:
                outputfh.close()
Exemple #19
0
def _classify_featureset(args):
    """ Classification job to be submitted to grid """
    # Extract all the arguments.
    # (There doesn't seem to be a better way to do this since one can't specify
    # required keyword arguments.)
    experiment_name = args.pop("experiment_name")
    task = args.pop("task")
    sampler = args.pop("sampler")
    feature_hasher = args.pop("feature_hasher")
    hasher_features = args.pop("hasher_features")
    job_name = args.pop("job_name")
    featureset = args.pop("featureset")
    featureset_name = args.pop("featureset_name")
    learner_name = args.pop("learner_name")
    train_path = args.pop("train_path")
    test_path = args.pop("test_path")
    train_set_name = args.pop("train_set_name")
    test_set_name = args.pop("test_set_name")
    shuffle = args.pop('shuffle')
    model_path = args.pop("model_path")
    prediction_prefix = args.pop("prediction_prefix")
    grid_search = args.pop("grid_search")
    grid_objective = args.pop("grid_objective")
    suffix = args.pop("suffix")
    log_path = args.pop("log_path")
    probability = args.pop("probability")
    results_path = args.pop("results_path")
    fixed_parameters = args.pop("fixed_parameters")
    sampler_parameters = args.pop("sampler_parameters")
    param_grid = args.pop("param_grid")
    pos_label_str = args.pop("pos_label_str")
    overwrite = args.pop("overwrite")
    feature_scaling = args.pop("feature_scaling")
    min_feature_count = args.pop("min_feature_count")
    grid_search_jobs = args.pop("grid_search_jobs")
    grid_search_folds = args.pop("grid_search_folds")
    cv_folds = args.pop("cv_folds")
    save_cv_folds = args.pop("save_cv_folds")
    stratified_folds = args.pop("do_stratified_folds")
    label_col = args.pop("label_col")
    id_col = args.pop("id_col")
    ids_to_floats = args.pop("ids_to_floats")
    class_map = args.pop("class_map")
    custom_learner_path = args.pop("custom_learner_path")
    quiet = args.pop('quiet', False)

    if args:
        raise ValueError(("Extra arguments passed to _classify_featureset: "
                          "{}").format(args.keys()))
    start_timestamp = datetime.datetime.now()

    with open(log_path, 'w') as log_file:
        # logging
        print("Task:", task, file=log_file)
        if task == 'cross_validate':
            print(("Cross-validating ({} folds) on {}, feature " +
                   "set {} ...").format(cv_folds, train_set_name, featureset),
                  file=log_file)
        elif task == 'evaluate':
            print(("Training on {}, Test on {}, " +
                   "feature set {} ...").format(train_set_name, test_set_name,
                                                featureset),
                  file=log_file)
        elif task == 'train':
            print("Training on {}, feature set {} ...".format(train_set_name,
                                                              featureset),
                  file=log_file)
        else:  # predict
            print(("Training on {}, Making predictions about {}, " +
                   "feature set {} ...").format(train_set_name, test_set_name,
                                                featureset),
                  file=log_file)

        # check whether a trained model on the same data with the same
        # featureset already exists if so, load it and then use it on test data
        modelfile = join(model_path, '{}.model'.format(job_name))
        if task == 'cross_validate' or (not exists(modelfile) or
                                        overwrite):
            train_examples = _load_featureset(train_path, featureset, suffix,
                                              label_col=label_col,
                                              id_col=id_col,
                                              ids_to_floats=ids_to_floats,
                                              quiet=quiet, class_map=class_map,
                                              feature_hasher=feature_hasher,
                                              num_features=hasher_features)

            train_set_size = len(train_examples.ids)
            if not train_examples.has_labels:
                raise ValueError('Training examples do not have labels')
            # initialize a classifer object
            learner = Learner(learner_name,
                              probability=probability,
                              feature_scaling=feature_scaling,
                              model_kwargs=fixed_parameters,
                              pos_label_str=pos_label_str,
                              min_feature_count=min_feature_count,
                              sampler=sampler,
                              sampler_kwargs=sampler_parameters,
                              custom_learner_path=custom_learner_path)
        # load the model if it already exists
        else:
            # import the custom learner path here in case we are reusing a
            # saved model
            if custom_learner_path:
                _import_custom_learner(custom_learner_path, learner_name)
            train_set_size = 'unknown'
            if exists(modelfile) and not overwrite:
                print(('\tloading pre-existing %s model: %s') % (learner_name,
                                                                 modelfile))
            learner = Learner.from_file(modelfile)

        # Load test set if there is one
        if task == 'evaluate' or task == 'predict':
            test_examples = _load_featureset(test_path, featureset, suffix,
                                             label_col=label_col,
                                             id_col=id_col,
                                             ids_to_floats=ids_to_floats,
                                             quiet=quiet, class_map=class_map,
                                             feature_hasher=feature_hasher,
                                             num_features=hasher_features)
            test_set_size = len(test_examples.ids)
        else:
            test_set_size = 'n/a'

        # create a list of dictionaries of the results information
        learner_result_dict_base = {'experiment_name': experiment_name,
                                    'train_set_name': train_set_name,
                                    'train_set_size': train_set_size,
                                    'test_set_name': test_set_name,
                                    'test_set_size': test_set_size,
                                    'featureset': json.dumps(featureset),
                                    'featureset_name': featureset_name,
                                    'shuffle': shuffle,
                                    'learner_name': learner_name,
                                    'task': task,
                                    'start_timestamp':
                                    start_timestamp.strftime('%d %b %Y %H:%M:'
                                                             '%S.%f'),
                                    'version': __version__,
                                    'feature_scaling': feature_scaling,
                                    'grid_search': grid_search,
                                    'grid_objective': grid_objective,
                                    'grid_search_folds': grid_search_folds,
                                    'min_feature_count': min_feature_count,
                                    'cv_folds': cv_folds,
                                    'save_cv_folds': save_cv_folds,
                                    'stratified_folds': stratified_folds,
                                    'scikit_learn_version': SCIKIT_VERSION}

        # check if we're doing cross-validation, because we only load/save
        # models when we're not.
        task_results = None
        if task == 'cross_validate':
            print('\tcross-validating', file=log_file)
            task_results, grid_scores, skll_fold_ids = learner.cross_validate(
                train_examples, shuffle=shuffle, stratified=stratified_folds,
                prediction_prefix=prediction_prefix, grid_search=grid_search,
                grid_search_folds=grid_search_folds, cv_folds=cv_folds,
                grid_objective=grid_objective, param_grid=param_grid,
                grid_jobs=grid_search_jobs, save_cv_folds=save_cv_folds)
        else:
            # if we have do not have a saved model, we need to train one.
            if not exists(modelfile) or overwrite:
                print(('\tfeaturizing and training new ' +
                       '{} model').format(learner_name),
                      file=log_file)

                if not isinstance(cv_folds, int):
                    grid_search_folds = cv_folds

                best_score = learner.train(train_examples,
                                           shuffle=shuffle,
                                           grid_search=grid_search,
                                           grid_search_folds=grid_search_folds,
                                           grid_objective=grid_objective,
                                           param_grid=param_grid,
                                           grid_jobs=grid_search_jobs)
                grid_scores = [best_score]

                # save model
                if model_path:
                    learner.save(modelfile)

                if grid_search:
                    # note: bankers' rounding is used in python 3,
                    # so these scores may be different between runs in
                    # python 2 and 3 at the final decimal place.
                    print('\tbest {} grid search score: {}'
                          .format(grid_objective, round(best_score, 3)),
                          file=log_file)
            else:
                grid_scores = [None]

            # print out the tuned parameters and best CV score
            param_out = ('{}: {}'.format(param_name, param_value)
                         for param_name, param_value in
                         iteritems(learner.model.get_params()))
            print('\thyperparameters: {}'.format(', '.join(param_out)),
                  file=log_file)

            # run on test set or cross-validate on training data,
            # depending on what was asked for

            if task == 'evaluate':
                print('\tevaluating predictions', file=log_file)
                task_results = [learner.evaluate(
                    test_examples, prediction_prefix=prediction_prefix,
                    grid_objective=grid_objective)]
            elif task == 'predict':
                print('\twriting predictions', file=log_file)
                learner.predict(test_examples,
                                prediction_prefix=prediction_prefix)
            # do nothing here for train

        end_timestamp = datetime.datetime.now()
        learner_result_dict_base['end_timestamp'] = end_timestamp.strftime(
            '%d %b %Y %H:%M:%S.%f')
        total_time = end_timestamp - start_timestamp
        learner_result_dict_base['total_time'] = str(total_time)

        if task == 'cross_validate' or task == 'evaluate':
            results_json_path = join(results_path,
                                     '{}.results.json'.format(job_name))

            res = _create_learner_result_dicts(task_results, grid_scores,
                                               learner_result_dict_base)

            # write out the result dictionary to a json file
            file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
            with open(results_json_path, file_mode) as json_file:
                json.dump(res, json_file, cls=NumpyTypeEncoder)

            with open(join(results_path,
                           '{}.results'.format(job_name)),
                      'w') as output_file:
                _print_fancy_output(res, output_file)
        else:
            res = [learner_result_dict_base]

        # write out the cv folds if required
        if task == 'cross_validate' and save_cv_folds:
            skll_fold_ids_file = experiment_name + '_skll_fold_ids.csv'
            file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
            with open(join(results_path, skll_fold_ids_file),
                      file_mode) as output_file:
                _write_skll_folds(skll_fold_ids, output_file)

    return res
Exemple #20
0
def _classify_featureset(args):
    ''' Classification job to be submitted to grid '''
    # Extract all the arguments.
    # (There doesn't seem to be a better way to do this since one can't specify
    # required keyword arguments.)
    experiment_name = args.pop("experiment_name")
    task = args.pop("task")
    job_name = args.pop("job_name")
    featureset = args.pop("featureset")
    learner_name = args.pop("learner_name")
    train_path = args.pop("train_path")
    test_path = args.pop("test_path")
    train_set_name = args.pop("train_set_name")
    test_set_name = args.pop("test_set_name")
    model_path = args.pop("model_path")
    prediction_prefix = args.pop("prediction_prefix")
    grid_search = args.pop("grid_search")
    grid_objective = args.pop("grid_objective")
    suffix = args.pop("suffix")
    log_path = args.pop("log_path")
    probability = args.pop("probability")
    results_path = args.pop("results_path")
    fixed_parameters = args.pop("fixed_parameters")
    param_grid = args.pop("param_grid")
    pos_label_str = args.pop("pos_label_str")
    overwrite = args.pop("overwrite")
    feature_scaling = args.pop("feature_scaling")
    min_feature_count = args.pop("min_feature_count")
    grid_search_jobs = args.pop("grid_search_jobs")
    cv_folds = args.pop("cv_folds")
    label_col = args.pop("label_col")
    ids_to_floats = args.pop("ids_to_floats")
    class_map = args.pop("class_map")
    quiet = args.pop('quiet', False)
    if args:
        raise ValueError(("Extra arguments passed to _classify_featureset: " +
                          "{}").format(args.keys()))

    timestamp = datetime.datetime.now().strftime('%d %b %Y %H:%M:%S')

    with open(log_path, 'w') as log_file:

        # logging
        print("Task:", task, file=log_file)
        if task == 'cross_validate':
            print(("Cross-validating on {}, feature " +
                   "set {} ...").format(train_set_name, featureset),
                  file=log_file)
        elif task == 'evaluate':
            print(("Training on {}, Test on {}, " +
                   "feature set {} ...").format(train_set_name, test_set_name,
                                                featureset),
                  file=log_file)
        elif task == 'train':
            print("Training on {}, feature set {} ...".format(train_set_name,
                                                              featureset),
                  file=log_file)
        else:  # predict
            print(("Training on {}, Making predictions about {}, " +
                   "feature set {} ...").format(train_set_name, test_set_name,
                                                featureset),
                  file=log_file)

        # check whether a trained model on the same data with the same
        # featureset already exists if so, load it and then use it on test data
        modelfile = os.path.join(model_path, '{}.model'.format(job_name))

        # load the training and test examples
        if task == 'cross_validate' or (not os.path.exists(modelfile) or
                                        overwrite):
            train_examples = _load_featureset(train_path, featureset, suffix,
                                              label_col=label_col,
                                              ids_to_floats=ids_to_floats,
                                              quiet=quiet, class_map=class_map)
            # initialize a classifer object
            learner = Learner(learner_name,
                              probability=probability,
                              feature_scaling=feature_scaling,
                              model_kwargs=fixed_parameters,
                              pos_label_str=pos_label_str,
                              min_feature_count=min_feature_count)
        # load the model if it already exists
        else:
            if os.path.exists(modelfile) and not overwrite:
                print(('\tloading pre-existing {} ' +
                       'model: {}').format(learner_name, modelfile))
            learner = Learner.from_file(modelfile)

        # Load test set if there is one
        if task == 'evaluate' or task == 'predict':
            test_examples = _load_featureset(test_path, featureset, suffix,
                                             label_col=label_col,
                                             ids_to_floats=ids_to_floats,
                                             quiet=quiet, class_map=class_map,
                                             unlabelled=True)


        # create a list of dictionaries of the results information
        learner_result_dict_base = {'experiment_name': experiment_name,
                                    'train_set_name': train_set_name,
                                    'test_set_name': test_set_name,
                                    'featureset': json.dumps(featureset),
                                    'learner_name': learner_name,
                                    'task': task,
                                    'timestamp': timestamp,
                                    'version': __version__,
                                    'feature_scaling': feature_scaling,
                                    'grid_search': grid_search,
                                    'grid_objective': grid_objective,
                                    'min_feature_count': min_feature_count}

        # check if we're doing cross-validation, because we only load/save
        # models when we're not.
        task_results = None
        if task == 'cross_validate':
            print('\tcross-validating', file=log_file)
            task_results, grid_scores = learner.cross_validate(train_examples,
                                                               prediction_prefix=prediction_prefix,
                                                               grid_search=grid_search,
                                                               cv_folds=cv_folds,
                                                               grid_objective=grid_objective,
                                                               param_grid=param_grid,
                                                               grid_jobs=grid_search_jobs)
        else:
            # if we have do not have a saved model, we need to train one.
            if not os.path.exists(modelfile) or overwrite:
                print(('\tfeaturizing and training new ' +
                       '{} model').format(learner_name),
                      file=log_file)

                grid_search_folds = 5
                if not isinstance(cv_folds, int):
                    grid_search_folds = cv_folds

                best_score = learner.train(train_examples,
                                           grid_search=grid_search,
                                           grid_search_folds=grid_search_folds,
                                           grid_objective=grid_objective,
                                           param_grid=param_grid,
                                           grid_jobs=grid_search_jobs)
                grid_scores = [best_score]

                # save model
                if model_path:
                    learner.save(modelfile)

                if grid_search:
                    print('\tbest {} grid search score: {}'
                          .format(grid_objective, round(best_score, 3)),
                          file=log_file)
            else:
                grid_scores = [None]

            # print out the tuned parameters and best CV score
            param_out = ('{}: {}'.format(param_name, param_value)
                         for param_name, param_value in
                         iteritems(learner.model.get_params()))
            print('\thyperparameters: {}'.format(', '.join(param_out)),
                  file=log_file)

            # run on test set or cross-validate on training data,
            # depending on what was asked for

            if task == 'evaluate':
                print('\tevaluating predictions', file=log_file)
                task_results = [learner.evaluate(
                    test_examples, prediction_prefix=prediction_prefix,
                    grid_objective=grid_objective)]
            elif task == 'predict':
                print('\twriting predictions', file=log_file)
                learner.predict(test_examples,
                                prediction_prefix=prediction_prefix)
            # do nothing here for train

        if task == 'cross_validate' or task == 'evaluate':
            results_json_path = os.path.join(results_path,
                                             '{}.results.json'.format(job_name))

            res = _create_learner_result_dicts(task_results, grid_scores,
                                               learner_result_dict_base)

            # write out the result dictionary to a json file
            file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
            with open(results_json_path, file_mode) as json_file:
                json.dump(res, json_file)

            with open(os.path.join(results_path, '{}.results'.format(job_name)),
                      'w') as output_file:
                _print_fancy_output(res, output_file)
        else:
            res = [learner_result_dict_base]

    return res
Exemple #21
0
def _classify_featureset(args):
    ''' Classification job to be submitted to grid '''
    # Extract all the arguments.
    # (There doesn't seem to be a better way to do this since one can't specify
    # required keyword arguments.)
    experiment_name = args.pop("experiment_name")
    task = args.pop("task")
    job_name = args.pop("job_name")
    featureset = args.pop("featureset")
    learner_name = args.pop("learner_name")
    train_path = args.pop("train_path")
    test_path = args.pop("test_path")
    train_set_name = args.pop("train_set_name")
    test_set_name = args.pop("test_set_name")
    model_path = args.pop("model_path")
    prediction_prefix = args.pop("prediction_prefix")
    grid_search = args.pop("grid_search")
    grid_objective = args.pop("grid_objective")
    suffix = args.pop("suffix")
    log_path = args.pop("log_path")
    probability = args.pop("probability")
    results_path = args.pop("results_path")
    fixed_parameters = args.pop("fixed_parameters")
    param_grid = args.pop("param_grid")
    pos_label_str = args.pop("pos_label_str")
    overwrite = args.pop("overwrite")
    feature_scaling = args.pop("feature_scaling")
    min_feature_count = args.pop("min_feature_count")
    grid_search_jobs = args.pop("grid_search_jobs")
    cv_folds = args.pop("cv_folds")
    label_col = args.pop("label_col")
    ids_to_floats = args.pop("ids_to_floats")
    class_map = args.pop("class_map")
    quiet = args.pop('quiet', False)
    if args:
        raise ValueError(("Extra arguments passed to _classify_featureset: " +
                          "{}").format(args.keys()))

    timestamp = datetime.datetime.now().strftime('%d %b %Y %H:%M:%S')

    with open(log_path, 'w') as log_file:

        # logging
        print("Task:", task, file=log_file)
        if task == 'cross_validate':
            print(("Cross-validating on {}, feature " + "set {} ...").format(
                train_set_name, featureset),
                  file=log_file)
        elif task == 'evaluate':
            print(
                ("Training on {}, Test on {}, " + "feature set {} ...").format(
                    train_set_name, test_set_name, featureset),
                file=log_file)
        elif task == 'train':
            print("Training on {}, feature set {} ...".format(
                train_set_name, featureset),
                  file=log_file)
        else:  # predict
            print(("Training on {}, Making predictions about {}, " +
                   "feature set {} ...").format(train_set_name, test_set_name,
                                                featureset),
                  file=log_file)

        # check whether a trained model on the same data with the same
        # featureset already exists if so, load it and then use it on test data
        modelfile = os.path.join(model_path, '{}.model'.format(job_name))

        # load the training and test examples
        if task == 'cross_validate' or (not os.path.exists(modelfile)
                                        or overwrite):
            train_examples = _load_featureset(train_path,
                                              featureset,
                                              suffix,
                                              label_col=label_col,
                                              ids_to_floats=ids_to_floats,
                                              quiet=quiet,
                                              class_map=class_map)
            # initialize a classifer object
            learner = Learner(learner_name,
                              probability=probability,
                              feature_scaling=feature_scaling,
                              model_kwargs=fixed_parameters,
                              pos_label_str=pos_label_str,
                              min_feature_count=min_feature_count)
        # load the model if it already exists
        else:
            if os.path.exists(modelfile) and not overwrite:
                print(('\tloading pre-existing {} ' + 'model: {}').format(
                    learner_name, modelfile))
            learner = Learner.from_file(modelfile)

        # Load test set if there is one
        if task == 'evaluate' or task == 'predict':
            test_examples = _load_featureset(test_path,
                                             featureset,
                                             suffix,
                                             label_col=label_col,
                                             ids_to_floats=ids_to_floats,
                                             quiet=quiet,
                                             class_map=class_map,
                                             unlabelled=True)

        # create a list of dictionaries of the results information
        learner_result_dict_base = {
            'experiment_name': experiment_name,
            'train_set_name': train_set_name,
            'test_set_name': test_set_name,
            'featureset': json.dumps(featureset),
            'learner_name': learner_name,
            'task': task,
            'timestamp': timestamp,
            'version': __version__,
            'feature_scaling': feature_scaling,
            'grid_search': grid_search,
            'grid_objective': grid_objective,
            'min_feature_count': min_feature_count
        }

        # check if we're doing cross-validation, because we only load/save
        # models when we're not.
        task_results = None
        if task == 'cross_validate':
            print('\tcross-validating', file=log_file)
            task_results, grid_scores = learner.cross_validate(
                train_examples,
                prediction_prefix=prediction_prefix,
                grid_search=grid_search,
                cv_folds=cv_folds,
                grid_objective=grid_objective,
                param_grid=param_grid,
                grid_jobs=grid_search_jobs)
        else:
            # if we have do not have a saved model, we need to train one.
            if not os.path.exists(modelfile) or overwrite:
                print(('\tfeaturizing and training new ' +
                       '{} model').format(learner_name),
                      file=log_file)

                grid_search_folds = 5
                if not isinstance(cv_folds, int):
                    grid_search_folds = cv_folds

                best_score = learner.train(train_examples,
                                           grid_search=grid_search,
                                           grid_search_folds=grid_search_folds,
                                           grid_objective=grid_objective,
                                           param_grid=param_grid,
                                           grid_jobs=grid_search_jobs)
                grid_scores = [best_score]

                # save model
                if model_path:
                    learner.save(modelfile)

                if grid_search:
                    # note: bankers' rounding is used in python 3,
                    # so these scores may be different between runs in
                    # python 2 and 3 at the final decimal place.
                    print('\tbest {} grid search score: {}'.format(
                        grid_objective, round(best_score, 3)),
                          file=log_file)
            else:
                grid_scores = [None]

            # print out the tuned parameters and best CV score
            param_out = ('{}: {}'.format(param_name, param_value)
                         for param_name, param_value in iteritems(
                             learner.model.get_params()))
            print('\thyperparameters: {}'.format(', '.join(param_out)),
                  file=log_file)

            # run on test set or cross-validate on training data,
            # depending on what was asked for

            if task == 'evaluate':
                print('\tevaluating predictions', file=log_file)
                task_results = [
                    learner.evaluate(test_examples,
                                     prediction_prefix=prediction_prefix,
                                     grid_objective=grid_objective)
                ]
            elif task == 'predict':
                print('\twriting predictions', file=log_file)
                learner.predict(test_examples,
                                prediction_prefix=prediction_prefix)
            # do nothing here for train

        if task == 'cross_validate' or task == 'evaluate':
            results_json_path = os.path.join(
                results_path, '{}.results.json'.format(job_name))

            res = _create_learner_result_dicts(task_results, grid_scores,
                                               learner_result_dict_base)

            # write out the result dictionary to a json file
            file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
            with open(results_json_path, file_mode) as json_file:
                json.dump(res, json_file)

            with open(
                    os.path.join(results_path, '{}.results'.format(job_name)),
                    'w') as output_file:
                _print_fancy_output(res, output_file)
        else:
            res = [learner_result_dict_base]

    return res