def test_learner_api_load_into_existing_instance():
    """
    Check that `Learner.load()` works as expected
    """

    # create a LinearSVC instance and train it on some data
    learner1 = Learner('LinearSVC')
    (train_fs, test_fs) = make_classification_data(num_examples=200,
                                                   num_features=5,
                                                   use_feature_hashing=False,
                                                   non_negative=True)
    learner1.train(train_fs, grid_search=False)

    # now use `load()` to replace the existing instance with a
    # different saved learner
    other_model_file = join(_my_dir, 'other',
                            'test_load_saved_model.{}.model'.format(sys.version_info[0]))
    learner1.load(other_model_file)

    # now load the saved model into another instance using the class method
    # `from_file()`
    learner2 = Learner.from_file(other_model_file)

    # check that the two instances are now basically the same
    eq_(learner1.model_type, learner2.model_type)
    eq_(learner1.model_params, learner2.model_params)
    eq_(learner1.model_kwargs, learner2.model_kwargs)
def __init__(self, model_path, threshold=None, positive_label=1):
    """
    Initialize the predictor.

    Parameters
    ----------
    model_path : str
        Path to use when loading trained model.
    threshold : float, optional
        If the model we're using is generating probabilities of the
        positive label, return 1 if it meets/exceeds the given threshold
        and 0 otherwise.
        Defaults to ``None``.
    positive_label : int, optional
        If the model is only being used to predict the probability of a
        particular class, this specifies the index of the class we're
        predicting. 1 = second class, which is the default for binary
        classification.
        Defaults to 1.
    """
    self._learner = Learner.from_file(model_path)
    # garyfeng: fix "AttributeError: 'Learner' object has no attribute
    # 'logger'" by attaching a module-level logger to the loaded learner
    self._learner.logger = logging.getLogger(__name__)
    self._pos_index = positive_label
    self.threshold = threshold
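# Illustrative sketch (not from the original source) of the thresholding
# behaviour described for the `threshold` parameter above: positive-class
# probabilities at or above the threshold map to 1, everything else to 0.
# The probability values here are made up for illustration.
import numpy as np

probabilities = np.array([0.12, 0.55, 0.80, 0.49])
threshold = 0.5
labels = (probabilities >= threshold).astype(int)
print(labels)   # [0 1 1 0]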
def __init__(self, model_path, threshold=None, positive_label=1, logger=None):
    """
    Initialize the predictor.

    Parameters
    ----------
    model_path : str
        Path to use when loading trained model.
    threshold : float, optional
        If the model we're using is generating probabilities of the
        positive label, return 1 if it meets/exceeds the given threshold
        and 0 otherwise.
        Defaults to ``None``.
    positive_label : int, optional
        If the model is only being used to predict the probability of a
        particular class, this specifies the index of the class we're
        predicting. 1 = second class, which is the default for binary
        classification.
        Defaults to 1.
    logger : logging object, optional
        A logging object. If ``None`` is passed, get logger from
        ``__name__``.
        Defaults to ``None``.
    """
    self.logger = logger if logger else logging.getLogger(__name__)
    self._learner = Learner.from_file(model_path)
    self._pos_index = positive_label
    self.threshold = threshold
def load_model(self, model_path):
    """
    Load the parser model from disk.

    Parameters
    ----------
    model_path : str
        Path to the directory containing the model file. The model name
        is always ``rst_parsing_all_feats_LogisticRegression.model``.
    """
    model_name = "rst_parsing_all_feats_LogisticRegression.model"
    self.model = Learner.from_file(join(model_path, model_name))
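# Illustrative sketch (not from the original source) of the same loading
# pattern outside the class: join a model directory with the fixed model
# filename and hand the result to `Learner.from_file()`. The import path
# and the directory are assumptions made for this example.
from os.path import join

from skll import Learner

model_dir = '/path/to/rst_parser_models'   # placeholder directory
model_name = 'rst_parsing_all_feats_LogisticRegression.model'
model = Learner.from_file(join(model_dir, model_name))
print(type(model.model))                   # the underlying scikit-learn estimator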
def test_backward_compatibility():
    '''
    Verify that a model from v0.9.17 can still be loaded and generate the
    same predictions.
    '''
    predict_path = os.path.join(_my_dir, 'backward_compatibility',
                                'v0.9.17_test_summary_test_summary_LogisticRegression.predictions')
    model_path = os.path.join(_my_dir, 'backward_compatibility',
                              'v0.9.17_test_summary_test_summary_LogisticRegression.{}.model'.format(sys.version_info[0]))
    test_path = os.path.join(_my_dir, 'backward_compatibility',
                             'v0.9.17_test_summary.jsonlines')

    learner = Learner.from_file(model_path)
    examples = load_examples(test_path, quiet=True)
    new_predictions = learner.predict(examples)[:, 1]

    with open(predict_path) as predict_file:
        for line, new_val in zip(predict_file, new_predictions):
            assert_almost_equal(float(line.strip()), new_val)
def update_model(model_file):
    """Read in the model file and save it again."""
    model_dir = dirname(model_file)

    # get the list of current files so that we can
    # remove them later to ensure there are no stranded
    # .npy files
    npy_files = glob.glob(join(model_dir, '*.npy'))

    # now load the SKLL model
    model = Learner.from_file(model_file)

    # delete the existing npy files. The model file will get overwritten,
    # but we do not know the exact number of current .npy files.
    for npy_file in npy_files:
        remove(npy_file)

    model.save(model_file)
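# Illustrative usage sketch (not from the original source): applying
# `update_model` above to every saved model under a directory, assuming the
# function and its dependencies (glob, join, dirname, remove, Learner) are in
# scope. The directory path and the '*.model' glob pattern are placeholders.
import glob
from os.path import join

for model_file in glob.glob(join('/path/to/models', '*.model')):
    update_model(model_file)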
def __init__(self, model_path, threshold=None, positive_label_index=1,
             all_labels=False, logger=None):
    """
    Initialize the predictor.

    Parameters
    ----------
    model_path : str
        Path to use when loading trained model.
    threshold : float, optional
        If the model we're using is generating probabilities of the
        positive label, return 1 if it meets/exceeds the given threshold
        and 0 otherwise.
        Defaults to ``None``.
    positive_label_index : int, optional
        If the model is only being used to predict the probability of a
        particular class, this specifies the index of the class we're
        predicting. 1 = second class, which is the default for binary
        classification.
        Defaults to 1.
    all_labels : bool, optional
        A flag indicating whether to return the probabilities for all
        labels in each row instead of just returning the probability of
        `positive_label_index`.
        Defaults to ``False``.
    logger : logging object, optional
        A logging object. If ``None`` is passed, get logger from
        ``__name__``.
        Defaults to ``None``.
    """
    self.logger = logger if logger else logging.getLogger(__name__)

    if threshold is not None and all_labels:
        raise ValueError("`threshold` and `all_labels` are mutually "
                         "exclusive; they cannot both be specified.")

    self._learner = Learner.from_file(model_path)
    self._pos_index = positive_label_index
    self.threshold = threshold
    self.all_labels = all_labels
    self.output_file_header = None
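# Illustrative sketch (not from the original source) of what `all_labels`
# controls, using a made-up probability matrix with one row per instance
# and one column per label.
import numpy as np

probs = np.array([[0.7, 0.2, 0.1],
                  [0.1, 0.6, 0.3]])
positive_label_index = 1

# default behaviour: only the probability of the positive label
positive_only = probs[:, positive_label_index]   # array([0.2, 0.6])

# with all_labels=True: the full row of per-label probabilities
all_label_probs = probs                          # shape (2, 3)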
def __init__(self, model_path, threshold=None, positive_label=1):
    """
    Initialize the predictor.

    :param model_path: Path to use when loading trained model.
    :type model_path: str
    :param threshold: If the model we're using is generating probabilities
                      of the positive label, return 1 if it meets/exceeds
                      the given threshold and 0 otherwise.
    :type threshold: float
    :param positive_label: If the model is only being used to predict the
                           probability of a particular class, this
                           specifies the index of the class we're
                           predicting. 1 = second class, which is the
                           default for binary classification.
    :type positive_label: int
    """
    self._learner = Learner.from_file(model_path)
    self._pos_index = positive_label
    self.threshold = threshold
def minimize_model(model_path, model_name):
    '''
    This function minimizes the model by removing information about
    features that get weights of 0.
    '''
    model = Learner.from_file(os.path.join(model_path, model_name))

    # Take out coefficients for features that are 0 for all classes.
    nonzero_feat_mask = ~np.all(model.model.coef_ == 0, axis=0)
    model.model.coef_ = model.model.coef_[:, nonzero_feat_mask]

    # Remove the extra words from the feat vectorizer.
    model.feat_vectorizer.restrict(nonzero_feat_mask)

    # Refit the feature selector to expect the correct size matrices.
    model.feat_selector.fit(np.ones((1, model.model.coef_.shape[1])))

    # Make the feature vectorizer return dense matrices (this is a bit
    # faster).
    model.feat_vectorizer.set_params(sparse=False)

    # Delete the raw_coef_ attribute that sklearn *only* uses when training.
    model.model.raw_coef_ = None

    # Save the minimized model.
    model.save(os.path.join(model_path, model_name))
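# Illustrative sketch (not from the original source) of the masking step used
# in `minimize_model` above: drop coefficient columns that are zero for every
# class. The coefficient matrix here is made up for illustration.
import numpy as np

coef = np.array([[0.5, 0.0, -1.2, 0.0],
                 [0.1, 0.0,  0.3, 0.0]])

# True for columns (features) with a nonzero weight in at least one class.
nonzero_feat_mask = ~np.all(coef == 0, axis=0)   # [ True False  True False]

# Keep only those columns, shrinking the model from 4 features to 2.
minimized_coef = coef[:, nonzero_feat_mask]      # shape (2, 2)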
def test_backward_compatibility():
    """
    Test to validate backward compatibility
    """
    predict_path = join(_my_dir, 'backward_compatibility',
                        ('v0.9.17_test_summary_test_summary_'
                         'LogisticRegression.predictions'))
    model_path = join(_my_dir, 'backward_compatibility',
                      ('v0.9.17_test_summary_test_summary_LogisticRegression.'
                       '{}.model').format(sys.version_info[0]))
    test_path = join(_my_dir, 'backward_compatibility',
                     'v0.9.17_test_summary.jsonlines')

    learner = Learner.from_file(model_path)
    examples = Reader.for_path(test_path, quiet=True).read()
    new_predictions = learner.predict(examples)[:, 1]

    with open(predict_path) as predict_file:
        old_predictions = [float(line.strip()) for line in predict_file]
    assert_almost_equal(new_predictions, old_predictions)
def __init__(self, model_path, threshold=None, positive_label=1,
             all_labels=False, logger=None):
    """
    Initialize the predictor.

    Parameters
    ----------
    model_path : str
        Path to use when loading trained model.
    threshold : float, optional
        If the model we're using is generating probabilities of the
        positive label, return 1 if it meets/exceeds the given threshold
        and 0 otherwise.
        Defaults to ``None``.
    positive_label : int, optional
        If the model is only being used to predict the probability of a
        particular class, this specifies the index of the class we're
        predicting. 1 = second class, which is the default for binary
        classification.
        Defaults to 1.
    all_labels : bool, optional
        A flag indicating whether to return the probabilities for all
        labels in each row instead of just returning the probability of
        `positive_label`.
        Defaults to ``False``.
    logger : logging object, optional
        A logging object. If ``None`` is passed, get logger from
        ``__name__``.
        Defaults to ``None``.
    """
    self.logger = logger if logger else logging.getLogger(__name__)

    if threshold is not None and all_labels:
        raise ValueError("`threshold` and `all_labels` are mutually "
                         "exclusive; they cannot both be specified.")

    self._learner = Learner.from_file(model_path)
    self._pos_index = positive_label
    self.threshold = threshold
    self.all_labels = all_labels
    self.output_file_header = None
def _classify_featureset(args): """ Classification job to be submitted to grid. Parameters ---------- args : dict A dictionary with arguments for classifying the ``FeatureSet`` instance. Returns ------- res : list of dicts The results of the classification, in the format of a list of dictionaries. Raises ------ ValueError If extra unknown arguments are passed to the function. """ # Extract all the arguments. # (There doesn't seem to be a better way to do this since one can't specify # required keyword arguments.) experiment_name = args.pop("experiment_name") task = args.pop("task") sampler = args.pop("sampler") feature_hasher = args.pop("feature_hasher") hasher_features = args.pop("hasher_features") job_name = args.pop("job_name") featureset = args.pop("featureset") featureset_name = args.pop("featureset_name") learner_name = args.pop("learner_name") train_path = args.pop("train_path") test_path = args.pop("test_path") train_set_name = args.pop("train_set_name") test_set_name = args.pop("test_set_name") shuffle = args.pop('shuffle') model_path = args.pop("model_path") prediction_prefix = args.pop("prediction_prefix") grid_search = args.pop("grid_search") grid_objective = args.pop("grid_objective") output_metrics = args.pop("output_metrics") suffix = args.pop("suffix") job_log_file = args.pop("log_file") job_log_level = args.pop("log_level") probability = args.pop("probability") pipeline = args.pop("pipeline") results_path = args.pop("results_path") fixed_parameters = args.pop("fixed_parameters") sampler_parameters = args.pop("sampler_parameters") param_grid = args.pop("param_grid") pos_label_str = args.pop("pos_label_str") overwrite = args.pop("overwrite") feature_scaling = args.pop("feature_scaling") min_feature_count = args.pop("min_feature_count") folds_file = args.pop("folds_file") grid_search_jobs = args.pop("grid_search_jobs") grid_search_folds = args.pop("grid_search_folds") cv_folds = args.pop("cv_folds") save_cv_folds = args.pop("save_cv_folds") save_cv_models = args.pop("save_cv_models") use_folds_file_for_grid_search = args.pop("use_folds_file_for_grid_search") stratified_folds = args.pop("do_stratified_folds") label_col = args.pop("label_col") id_col = args.pop("id_col") ids_to_floats = args.pop("ids_to_floats") class_map = args.pop("class_map") custom_learner_path = args.pop("custom_learner_path") custom_metric_path = args.pop("custom_metric_path") quiet = args.pop('quiet', False) learning_curve_cv_folds = args.pop("learning_curve_cv_folds") learning_curve_train_sizes = args.pop("learning_curve_train_sizes") if args: raise ValueError(("Extra arguments passed to _classify_featureset: " "{}").format(args.keys())) start_timestamp = datetime.datetime.now() # create a new SKLL logger for this specific job and # use the given log level logger = get_skll_logger(job_name, job_log_file, log_level=job_log_level) try: # log messages logger.info("Task: {}".format(task)) # check if we have any possible custom metrics possible_custom_metric_names = [] for metric_name in output_metrics + [grid_objective]: # metrics that are not in `SCORERS` or `None` are candidates # (the `None` is a by-product of how jobs with single tuning # objectives are created) if metric_name not in SCORERS and metric_name is not None: possible_custom_metric_names.append(metric_name) # if the metric is already in `SCORERS`, is it a custom one # that we already registered? 
if so, log that elif metric_name in _CUSTOM_METRICS: logger.info( f"custom metric '{metric_name}' is already registered") # initialize list that will hold any invalid metrics # that we could not register as custom metrics invalid_metric_names = [] # if we have possible custom metrics if possible_custom_metric_names: # check that we have a file to load them from if not custom_metric_path: raise ValueError( f"invalid metrics specified: {possible_custom_metric_names}" ) else: # try to register each possible custom metric # raise an exception if we fail, if we don't then # add the custom metric function to `globals()` so # that it serializes properly for gridmap for custom_metric_name in possible_custom_metric_names: try: custom_metric_func = register_custom_metric( custom_metric_path, custom_metric_name) except (AttributeError, NameError, ValueError): invalid_metric_names.append(custom_metric_name) else: logger.info(f"registered '{custom_metric_name}' as a " f"custom metric") globals()[custom_metric_name] = custom_metric_func # raise an error if we have any invalid metrics if invalid_metric_names: raise ValueError( f"invalid metrics specified: {invalid_metric_names}. " f"If these are custom metrics, check the function " f"names.") if task == 'cross_validate': if isinstance(cv_folds, int): num_folds = cv_folds else: # folds_file was used, so count the unique fold ids. num_folds = len(set(cv_folds.values())) logger.info("Cross-validating ({} folds) on {}, feature " "set {} ...".format(num_folds, train_set_name, featureset)) elif task == 'evaluate': logger.info("Training on {}, Test on {}, " "feature set {} ...".format(train_set_name, test_set_name, featureset)) elif task == 'train': logger.info("Training on {}, feature set {} ...".format( train_set_name, featureset)) elif task == 'learning_curve': logger.info("Generating learning curve " "({} 80/20 folds, sizes={}, objective={}) on {}, " "feature set {} ...".format( learning_curve_cv_folds, learning_curve_train_sizes, grid_objective, train_set_name, featureset)) else: # predict logger.info("Training on {}, Making predictions on {}, " "feature set {} ...".format(train_set_name, test_set_name, featureset)) # check whether a trained model on the same data with the same # featureset already exists if so, load it and then use it on test data modelfile = join(model_path, '{}.model'.format(job_name)) if (task in ['cross_validate', 'learning_curve'] or not exists(modelfile) or overwrite): train_examples = load_featureset(train_path, featureset, suffix, label_col=label_col, id_col=id_col, ids_to_floats=ids_to_floats, quiet=quiet, class_map=class_map, feature_hasher=feature_hasher, num_features=hasher_features, logger=logger) train_set_size = len(train_examples.ids) if not train_examples.has_labels: raise ValueError('Training examples do not have labels') # initialize a classifer object learner = Learner(learner_name, probability=probability, pipeline=pipeline, feature_scaling=feature_scaling, model_kwargs=fixed_parameters, pos_label_str=pos_label_str, min_feature_count=min_feature_count, sampler=sampler, sampler_kwargs=sampler_parameters, custom_learner_path=custom_learner_path, logger=logger) # load the model if it already exists else: # import custom learner into global namespace if we are reusing # a saved model if custom_learner_path: globals()[learner_name] = load_custom_learner( custom_learner_path, learner_name) train_set_size = 'unknown' if exists(modelfile) and not overwrite: logger.info("Loading pre-existing {} model: {}".format( learner_name, 
modelfile)) learner = Learner.from_file(modelfile) # attach the job logger to this learner learner.logger = logger # Load test set if there is one if task == 'evaluate' or task == 'predict': test_examples = load_featureset(test_path, featureset, suffix, label_col=label_col, id_col=id_col, ids_to_floats=ids_to_floats, quiet=quiet, class_map=class_map, feature_hasher=feature_hasher, num_features=hasher_features) test_set_size = len(test_examples.ids) else: test_set_size = 'n/a' # compute information about xval and grid folds that can be put in results # in readable form if isinstance(cv_folds, dict): cv_folds_to_print = '{} via folds file'.format( len(set(cv_folds.values()))) else: cv_folds_to_print = str(cv_folds) if isinstance(grid_search_folds, dict): grid_search_folds_to_print = \ '{} via folds file'.format(len(set(grid_search_folds.values()))) else: grid_search_folds_to_print = str(grid_search_folds) # create a list of dictionaries of the results information learner_result_dict_base = { 'experiment_name': experiment_name, 'train_set_name': train_set_name, 'train_set_size': train_set_size, 'test_set_name': test_set_name, 'test_set_size': test_set_size, 'featureset': json.dumps(featureset), 'featureset_name': featureset_name, 'shuffle': shuffle, 'learner_name': learner_name, 'task': task, 'start_timestamp': start_timestamp.strftime('%d %b %Y %H:%M:' '%S.%f'), 'version': __version__, 'feature_scaling': feature_scaling, 'folds_file': folds_file, 'grid_search': grid_search, 'grid_objective': grid_objective, 'grid_search_folds': grid_search_folds_to_print, 'min_feature_count': min_feature_count, 'cv_folds': cv_folds_to_print, 'using_folds_file': isinstance(cv_folds, dict) or isinstance(grid_search_folds, dict), 'save_cv_folds': save_cv_folds, 'save_cv_models': save_cv_models, 'use_folds_file_for_grid_search': use_folds_file_for_grid_search, 'stratified_folds': stratified_folds, 'scikit_learn_version': SCIKIT_VERSION } # check if we're doing cross-validation, because we only load/save # models when we're not. task_results = None if task == 'cross_validate': logger.info('Cross-validating') ( task_results, grid_scores, grid_search_cv_results_dicts, skll_fold_ids, models ) = learner.cross_validate( train_examples, shuffle=shuffle, stratified=stratified_folds, prediction_prefix=prediction_prefix, grid_search=grid_search, grid_search_folds=grid_search_folds, cv_folds=cv_folds, grid_objective=grid_objective, output_metrics=output_metrics, param_grid=param_grid, grid_jobs=grid_search_jobs, save_cv_folds=save_cv_folds, save_cv_models=save_cv_models, use_custom_folds_for_grid_search=use_folds_file_for_grid_search ) if models: for index, m in enumerate(models, start=1): modelfile = join(model_path, '{}_fold{}.model'.format(job_name, index)) m.save(modelfile) elif task == 'learning_curve': logger.info("Generating learning curve(s)") (curve_train_scores, curve_test_scores, computed_curve_train_sizes) = learner.learning_curve( train_examples, grid_objective, cv_folds=learning_curve_cv_folds, train_sizes=learning_curve_train_sizes) else: # if we have do not have a saved model, we need to train one. 
if not exists(modelfile) or overwrite: logger.info("Featurizing and training new {} model".format( learner_name)) (best_score, grid_search_cv_results) = learner.train( train_examples, shuffle=shuffle, grid_search=grid_search, grid_search_folds=grid_search_folds, grid_objective=grid_objective, param_grid=param_grid, grid_jobs=grid_search_jobs) grid_scores = [best_score] grid_search_cv_results_dicts = [grid_search_cv_results] # save model if model_path: learner.save(modelfile) if grid_search: logger.info("Best {} grid search score: {}".format( grid_objective, round(best_score, 3))) else: grid_scores = [None] grid_search_cv_results_dicts = [None] # print out the parameters param_out = ('{}: {}'.format(param_name, param_value) for param_name, param_value in learner.model.get_params().items()) logger.info("Hyperparameters: {}".format(', '.join(param_out))) # run on test set or cross-validate on training data, # depending on what was asked for if task == 'evaluate': logger.info("Evaluating predictions") task_results = [ learner.evaluate(test_examples, prediction_prefix=prediction_prefix, grid_objective=grid_objective, output_metrics=output_metrics) ] elif task == 'predict': logger.info("Writing predictions") # we set `class_labels` to `False` so that if the learner is # probabilistic, probabilities are written instead of labels learner.predict(test_examples, prediction_prefix=prediction_prefix, class_labels=False) # do nothing here for train end_timestamp = datetime.datetime.now() learner_result_dict_base['end_timestamp'] = end_timestamp.strftime( '%d %b %Y %H:%M:%S.%f') total_time = end_timestamp - start_timestamp learner_result_dict_base['total_time'] = str(total_time) if task == 'cross_validate' or task == 'evaluate': results_json_path = join(results_path, '{}.results.json'.format(job_name)) res = _create_learner_result_dicts(task_results, grid_scores, grid_search_cv_results_dicts, learner_result_dict_base) # write out the result dictionary to a json file with open(results_json_path, 'w') as json_file: json.dump(res, json_file, cls=NumpyTypeEncoder) with open(join(results_path, '{}.results'.format(job_name)), 'w') as output_file: _print_fancy_output(res, output_file) elif task == 'learning_curve': results_json_path = join(results_path, '{}.results.json'.format(job_name)) res = {} res.update(learner_result_dict_base) res.update({ 'learning_curve_cv_folds': learning_curve_cv_folds, 'given_curve_train_sizes': learning_curve_train_sizes, 'learning_curve_train_scores_means': np.mean(curve_train_scores, axis=1), 'learning_curve_test_scores_means': np.mean(curve_test_scores, axis=1), 'learning_curve_train_scores_stds': np.std(curve_train_scores, axis=1, ddof=1), 'learning_curve_test_scores_stds': np.std(curve_test_scores, axis=1, ddof=1), 'computed_curve_train_sizes': computed_curve_train_sizes }) # we need to return and write out a list of dictionaries res = [res] # write out the result dictionary to a json file with open(results_json_path, 'w') as json_file: json.dump(res, json_file, cls=NumpyTypeEncoder) # For all other tasks, i.e. 
train or predict else: if results_path: results_json_path = join(results_path, '{}.results.json'.format(job_name)) assert len(grid_scores) == 1 assert len(grid_search_cv_results_dicts) == 1 grid_search_cv_results_dict = {"grid_score": grid_scores[0]} grid_search_cv_results_dict["grid_search_cv_results"] = \ grid_search_cv_results_dicts[0] grid_search_cv_results_dict.update(learner_result_dict_base) # write out the result dictionary to a json file with open(results_json_path, 'w') as json_file: json.dump(grid_search_cv_results_dict, json_file, cls=NumpyTypeEncoder) res = [learner_result_dict_base] # write out the cv folds if required if task == 'cross_validate' and save_cv_folds: skll_fold_ids_file = experiment_name + '_skll_fold_ids.csv' with open(join(results_path, skll_fold_ids_file), 'w') as output_file: _write_skll_folds(skll_fold_ids, output_file) finally: close_and_remove_logger_handlers(logger) return res
def _classify_featureset(args): """ Classification job to be submitted to grid """ # Extract all the arguments. # (There doesn't seem to be a better way to do this since one can't specify # required keyword arguments.) experiment_name = args.pop("experiment_name") task = args.pop("task") sampler = args.pop("sampler") feature_hasher = args.pop("feature_hasher") hasher_features = args.pop("hasher_features") job_name = args.pop("job_name") featureset = args.pop("featureset") featureset_name = args.pop("featureset_name") learner_name = args.pop("learner_name") train_path = args.pop("train_path") test_path = args.pop("test_path") train_set_name = args.pop("train_set_name") test_set_name = args.pop("test_set_name") shuffle = args.pop('shuffle') model_path = args.pop("model_path") prediction_prefix = args.pop("prediction_prefix") grid_search = args.pop("grid_search") grid_objective = args.pop("grid_objective") suffix = args.pop("suffix") log_path = args.pop("log_path") probability = args.pop("probability") results_path = args.pop("results_path") fixed_parameters = args.pop("fixed_parameters") sampler_parameters = args.pop("sampler_parameters") param_grid = args.pop("param_grid") pos_label_str = args.pop("pos_label_str") overwrite = args.pop("overwrite") feature_scaling = args.pop("feature_scaling") min_feature_count = args.pop("min_feature_count") grid_search_jobs = args.pop("grid_search_jobs") grid_search_folds = args.pop("grid_search_folds") cv_folds = args.pop("cv_folds") stratified_folds = args.pop("do_stratified_folds") label_col = args.pop("label_col") id_col = args.pop("id_col") ids_to_floats = args.pop("ids_to_floats") class_map = args.pop("class_map") custom_learner_path = args.pop("custom_learner_path") quiet = args.pop('quiet', False) if args: raise ValueError(("Extra arguments passed to _classify_featureset: " "{}").format(args.keys())) start_timestamp = datetime.datetime.now() with open(log_path, 'w') as log_file: # logging print("Task:", task, file=log_file) if task == 'cross_validate': print(("Cross-validating ({} folds) on {}, feature " + "set {} ...").format(cv_folds, train_set_name, featureset), file=log_file) elif task == 'evaluate': print(("Training on {}, Test on {}, " + "feature set {} ...").format(train_set_name, test_set_name, featureset), file=log_file) elif task == 'train': print("Training on {}, feature set {} ...".format(train_set_name, featureset), file=log_file) else: # predict print(("Training on {}, Making predictions about {}, " + "feature set {} ...").format(train_set_name, test_set_name, featureset), file=log_file) # check whether a trained model on the same data with the same # featureset already exists if so, load it and then use it on test data modelfile = join(model_path, '{}.model'.format(job_name)) if task == 'cross_validate' or (not exists(modelfile) or overwrite): train_examples = _load_featureset(train_path, featureset, suffix, label_col=label_col, id_col=id_col, ids_to_floats=ids_to_floats, quiet=quiet, class_map=class_map, feature_hasher=feature_hasher, num_features=hasher_features) train_set_size = len(train_examples.ids) if not train_examples.has_labels: raise ValueError('Training examples do not have labels') # initialize a classifer object learner = Learner(learner_name, probability=probability, feature_scaling=feature_scaling, model_kwargs=fixed_parameters, pos_label_str=pos_label_str, min_feature_count=min_feature_count, sampler=sampler, sampler_kwargs=sampler_parameters, custom_learner_path=custom_learner_path) # load the model if it already 
exists else: # import the custom learner path here in case we are reusing a # saved model if custom_learner_path: _import_custom_learner(custom_learner_path, learner_name) train_set_size = 'unknown' if exists(modelfile) and not overwrite: print(('\tloading pre-existing %s model: %s') % (learner_name, modelfile)) learner = Learner.from_file(modelfile) # Load test set if there is one if task == 'evaluate' or task == 'predict': test_examples = _load_featureset(test_path, featureset, suffix, label_col=label_col, id_col=id_col, ids_to_floats=ids_to_floats, quiet=quiet, class_map=class_map, feature_hasher=feature_hasher, num_features=hasher_features) test_set_size = len(test_examples.ids) else: test_set_size = 'n/a' # create a list of dictionaries of the results information learner_result_dict_base = {'experiment_name': experiment_name, 'train_set_name': train_set_name, 'train_set_size': train_set_size, 'test_set_name': test_set_name, 'test_set_size': test_set_size, 'featureset': json.dumps(featureset), 'featureset_name': featureset_name, 'shuffle': shuffle, 'learner_name': learner_name, 'task': task, 'start_timestamp': start_timestamp.strftime('%d %b %Y %H:%M:' '%S.%f'), 'version': __version__, 'feature_scaling': feature_scaling, 'grid_search': grid_search, 'grid_objective': grid_objective, 'grid_search_folds': grid_search_folds, 'min_feature_count': min_feature_count, 'cv_folds': cv_folds, 'stratified_folds': stratified_folds, 'scikit_learn_version': SCIKIT_VERSION} # check if we're doing cross-validation, because we only load/save # models when we're not. task_results = None if task == 'cross_validate': print('\tcross-validating', file=log_file) task_results, grid_scores = learner.cross_validate( train_examples, shuffle=shuffle, stratified=stratified_folds, prediction_prefix=prediction_prefix, grid_search=grid_search, grid_search_folds=grid_search_folds, cv_folds=cv_folds, grid_objective=grid_objective, param_grid=param_grid, grid_jobs=grid_search_jobs) else: # if we have do not have a saved model, we need to train one. if not exists(modelfile) or overwrite: print(('\tfeaturizing and training new ' + '{} model').format(learner_name), file=log_file) if not isinstance(cv_folds, int): grid_search_folds = cv_folds best_score = learner.train(train_examples, shuffle=shuffle, grid_search=grid_search, grid_search_folds=grid_search_folds, grid_objective=grid_objective, param_grid=param_grid, grid_jobs=grid_search_jobs) grid_scores = [best_score] # save model if model_path: learner.save(modelfile) if grid_search: # note: bankers' rounding is used in python 3, # so these scores may be different between runs in # python 2 and 3 at the final decimal place. 
print('\tbest {} grid search score: {}' .format(grid_objective, round(best_score, 3)), file=log_file) else: grid_scores = [None] # print out the tuned parameters and best CV score param_out = ('{}: {}'.format(param_name, param_value) for param_name, param_value in iteritems(learner.model.get_params())) print('\thyperparameters: {}'.format(', '.join(param_out)), file=log_file) # run on test set or cross-validate on training data, # depending on what was asked for if task == 'evaluate': print('\tevaluating predictions', file=log_file) task_results = [learner.evaluate( test_examples, prediction_prefix=prediction_prefix, grid_objective=grid_objective)] elif task == 'predict': print('\twriting predictions', file=log_file) learner.predict(test_examples, prediction_prefix=prediction_prefix) # do nothing here for train end_timestamp = datetime.datetime.now() learner_result_dict_base['end_timestamp'] = end_timestamp.strftime( '%d %b %Y %H:%M:%S.%f') total_time = end_timestamp - start_timestamp learner_result_dict_base['total_time'] = str(total_time) if task == 'cross_validate' or task == 'evaluate': results_json_path = join(results_path, '{}.results.json'.format(job_name)) res = _create_learner_result_dicts(task_results, grid_scores, learner_result_dict_base) # write out the result dictionary to a json file file_mode = 'w' if sys.version_info >= (3, 0) else 'wb' with open(results_json_path, file_mode) as json_file: json.dump(res, json_file, cls=NumpyTypeEncoder) with open(join(results_path, '{}.results'.format(job_name)), 'w') as output_file: _print_fancy_output(res, output_file) else: res = [learner_result_dict_base] return res
def main(argv=None): """ Handles command line arguments and gets things started. Parameters ---------- argv : list of str List of arguments, as if specified on the command-line. If None, ``sys.argv[1:]`` is used instead. """ # Get command line arguments parser = argparse.ArgumentParser( description="Loads a trained model and outputs predictions based \ on input feature files.", formatter_class=argparse.ArgumentDefaultsHelpFormatter, conflict_handler='resolve') parser.add_argument('model_file', help='Model file to load and use for generating ' 'predictions.') parser.add_argument('input_files', help='A space-separated list of CSV, TSV, or ' 'jsonlines files (with or without the label ' 'column), with the appropriate suffix.', nargs='+') parser.add_argument('-i', '--id_col', help='Name of the column which contains the instance ' 'IDs in ARFF, CSV, or TSV files.', default='id') parser.add_argument('-l', '--label_col', help='Name of the column which contains the labels ' 'in ARFF, CSV, or TSV files. For ARFF files, ' 'this must be the final column to count as the ' 'label.', default='y') group = parser.add_mutually_exclusive_group() group.add_argument('-p', '--predict_labels', help="If the model is doing probabilistic " "classification, output the class label " "with the highest probability instead of " "the class probabilities.", action='store_true', default=False) group.add_argument('-t', '--threshold', help="If the model we're using is " "doing probabilistic binary " "classification, output the positive " "class label if its probability" "meets/exceeds this threshold" "and output the negative class " "label otherwise.", type=float) parser.add_argument('-q', '--quiet', help='Suppress printing of "Loading..." messages.', action='store_true') parser.add_argument('-o', '--output_file', help="Path to output tsv file. If not specified, " "predictions will be printed to stdout. For " "probabilistic binary classification, the " "probability of the positive class will " "always be in the last column.") parser.add_argument('--version', action='version', version='%(prog)s {0}'.format(__version__)) args = parser.parse_args(argv) # Make warnings from built-in warnings module get formatted more nicely logging.captureWarnings(True) logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' + '%(message)s')) logger = logging.getLogger(__name__) # load the model from disk learner = Learner.from_file(args.model_file) # is the model a regressor or a classifier? 
estimator_type = learner.model_type._estimator_type # if we are using a binary classification model, get the positive # class label string if from the `pos_label_str` attribute, and if # that is `None`, get it from the learner's label dictionary; also # get the string denoting the negative label which is just the other # label in the list if estimator_type == 'classifier': if len(learner.label_list) == 2: if learner.pos_label_str is not None: pos_label_str = learner.pos_label_str else: pos_label_str = [ label for label in learner.label_dict if learner.label_dict[label] == 1 ][0] neg_label_str = [ label for label in learner.label_list if label != pos_label_str ][0] logger.info("{} is the label for the positive class.") # if we want to choose labels by thresholding the probabilities, # make sure that the learner is probabilistic AND binary first is_probabilistic_classifier = hasattr( learner._model, 'predict_proba') and learner.probability if (args.threshold is not None and (not is_probabilistic_classifier or len(learner.label_list) != 2)): error_msg = ('Cannot threshold probabilities to predict ' 'positive class since given {} learner is ' 'either multi-class, non-probabilistic, or ' 'was not trained with probability=True' '.'.format(learner._model_type.__name__)) logger.error(error_msg) raise ValueError(error_msg) # if we want to choose labels by predicting the most likely label, # make sure that the learner is probabilistic if args.predict_labels and not is_probabilistic_classifier: error_msg = ('Cannot predict most likely labels from probabilities ' 'since given {} learner is either non-probabilistic or ' 'was not trained with probability=True' '.'.format(learner._model_type.__name__)) logger.error(error_msg) raise ValueError(error_msg) # iterate over all the specified input files for i, input_file in enumerate(args.input_files): # make sure each file extension is one we can process input_extension = os.path.splitext(input_file)[1].lower() if input_extension not in EXT_TO_READER: logger.error(('Input file must be in either .arff, .csv, ' '.jsonlines, .libsvm, .megam, .ndj, or .tsv format. 
' ' Skipping file {}').format(input_file)) continue else: # read in the file into a featureset reader = EXT_TO_READER[input_extension](input_file, quiet=args.quiet, label_col=args.label_col, id_col=args.id_col) feature_set = reader.read() # for this featureset, get the predictions of either the # most likely class labels or the class label probabilities; # if the model is a regressor then `class_labels` will be # ignored entirely original_predictions = learner.predict( feature_set, class_labels=not learner.probability or args.predict_labels) # get the appropriate header depending on the what we will # be outputting; if we are using a regressor or a non-probabilistic # learner, or thresholding probabilities, or predicting most likely # labels, we are outputting only two columns - the ID and the label, # otherwise we are outputting N + 1 columns where N = number of classes if (estimator_type == 'regressor' or not learner.probability or args.predict_labels or args.threshold is not None): header = ["id", "prediction"] else: header = ["id"] + [str(x) for x in learner.label_list] # now let us start computing what we want to output based # on the predictions we have so far # Threshold the positive class label probability if args.threshold is not None: predictions = [] for neg_label_prob, pos_label_prob in original_predictions: chosen_label = pos_label_str if pos_label_prob >= args.threshold else neg_label_str predictions.append(chosen_label) # For everything else, we can just use the original predictions else: predictions = original_predictions # now initialize the output file outputfh = open(args.output_file, 'a') if args.output_file else sys.stdout # write out the header first but only once if i == 0: print("\t".join(header), file=outputfh) # and now write out the predictions for j, prediction in enumerate(predictions): id_ = feature_set.ids[j] prediction_str = "\t".join( [str(p) for p in prediction]) if isinstance( prediction, (np.ndarray, list)) else prediction print("{}\t{}".format(id_, prediction_str), file=outputfh) # close the file if we had opened one if args.output_file: outputfh.close()
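# Illustrative sketch (not from the original source): driving the `main()`
# function above programmatically instead of via the command line. The model
# and feature file names are placeholders; the options correspond to the
# argument parser defined in `main()`.
main(['my_model.model', 'test_features.jsonlines',
      '--threshold', '0.5',
      '--output_file', 'predictions.tsv'])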
def _classify_featureset(args): """ Classification job to be submitted to grid """ # Extract all the arguments. # (There doesn't seem to be a better way to do this since one can't specify # required keyword arguments.) experiment_name = args.pop("experiment_name") task = args.pop("task") sampler = args.pop("sampler") feature_hasher = args.pop("feature_hasher") hasher_features = args.pop("hasher_features") job_name = args.pop("job_name") featureset = args.pop("featureset") featureset_name = args.pop("featureset_name") learner_name = args.pop("learner_name") train_path = args.pop("train_path") test_path = args.pop("test_path") train_set_name = args.pop("train_set_name") test_set_name = args.pop("test_set_name") shuffle = args.pop('shuffle') model_path = args.pop("model_path") prediction_prefix = args.pop("prediction_prefix") grid_search = args.pop("grid_search") grid_objective = args.pop("grid_objective") suffix = args.pop("suffix") log_path = args.pop("log_path") probability = args.pop("probability") results_path = args.pop("results_path") fixed_parameters = args.pop("fixed_parameters") sampler_parameters = args.pop("sampler_parameters") param_grid = args.pop("param_grid") pos_label_str = args.pop("pos_label_str") overwrite = args.pop("overwrite") feature_scaling = args.pop("feature_scaling") min_feature_count = args.pop("min_feature_count") grid_search_jobs = args.pop("grid_search_jobs") grid_search_folds = args.pop("grid_search_folds") cv_folds = args.pop("cv_folds") save_cv_folds = args.pop("save_cv_folds") stratified_folds = args.pop("do_stratified_folds") label_col = args.pop("label_col") id_col = args.pop("id_col") ids_to_floats = args.pop("ids_to_floats") class_map = args.pop("class_map") custom_learner_path = args.pop("custom_learner_path") quiet = args.pop('quiet', False) if args: raise ValueError(("Extra arguments passed to _classify_featureset: " "{}").format(args.keys())) start_timestamp = datetime.datetime.now() with open(log_path, 'w') as log_file: # logging print("Task:", task, file=log_file) if task == 'cross_validate': print(("Cross-validating ({} folds) on {}, feature " + "set {} ...").format(cv_folds, train_set_name, featureset), file=log_file) elif task == 'evaluate': print(("Training on {}, Test on {}, " + "feature set {} ...").format(train_set_name, test_set_name, featureset), file=log_file) elif task == 'train': print("Training on {}, feature set {} ...".format(train_set_name, featureset), file=log_file) else: # predict print(("Training on {}, Making predictions about {}, " + "feature set {} ...").format(train_set_name, test_set_name, featureset), file=log_file) # check whether a trained model on the same data with the same # featureset already exists if so, load it and then use it on test data modelfile = join(model_path, '{}.model'.format(job_name)) if task == 'cross_validate' or (not exists(modelfile) or overwrite): train_examples = _load_featureset(train_path, featureset, suffix, label_col=label_col, id_col=id_col, ids_to_floats=ids_to_floats, quiet=quiet, class_map=class_map, feature_hasher=feature_hasher, num_features=hasher_features) train_set_size = len(train_examples.ids) if not train_examples.has_labels: raise ValueError('Training examples do not have labels') # initialize a classifer object learner = Learner(learner_name, probability=probability, feature_scaling=feature_scaling, model_kwargs=fixed_parameters, pos_label_str=pos_label_str, min_feature_count=min_feature_count, sampler=sampler, sampler_kwargs=sampler_parameters, 
custom_learner_path=custom_learner_path) # load the model if it already exists else: # import the custom learner path here in case we are reusing a # saved model if custom_learner_path: _import_custom_learner(custom_learner_path, learner_name) train_set_size = 'unknown' if exists(modelfile) and not overwrite: print(('\tloading pre-existing %s model: %s') % (learner_name, modelfile)) learner = Learner.from_file(modelfile) # Load test set if there is one if task == 'evaluate' or task == 'predict': test_examples = _load_featureset(test_path, featureset, suffix, label_col=label_col, id_col=id_col, ids_to_floats=ids_to_floats, quiet=quiet, class_map=class_map, feature_hasher=feature_hasher, num_features=hasher_features) test_set_size = len(test_examples.ids) else: test_set_size = 'n/a' # create a list of dictionaries of the results information learner_result_dict_base = {'experiment_name': experiment_name, 'train_set_name': train_set_name, 'train_set_size': train_set_size, 'test_set_name': test_set_name, 'test_set_size': test_set_size, 'featureset': json.dumps(featureset), 'featureset_name': featureset_name, 'shuffle': shuffle, 'learner_name': learner_name, 'task': task, 'start_timestamp': start_timestamp.strftime('%d %b %Y %H:%M:' '%S.%f'), 'version': __version__, 'feature_scaling': feature_scaling, 'grid_search': grid_search, 'grid_objective': grid_objective, 'grid_search_folds': grid_search_folds, 'min_feature_count': min_feature_count, 'cv_folds': cv_folds, 'save_cv_folds': save_cv_folds, 'stratified_folds': stratified_folds, 'scikit_learn_version': SCIKIT_VERSION} # check if we're doing cross-validation, because we only load/save # models when we're not. task_results = None if task == 'cross_validate': print('\tcross-validating', file=log_file) task_results, grid_scores, skll_fold_ids = learner.cross_validate( train_examples, shuffle=shuffle, stratified=stratified_folds, prediction_prefix=prediction_prefix, grid_search=grid_search, grid_search_folds=grid_search_folds, cv_folds=cv_folds, grid_objective=grid_objective, param_grid=param_grid, grid_jobs=grid_search_jobs, save_cv_folds=save_cv_folds) else: # if we have do not have a saved model, we need to train one. if not exists(modelfile) or overwrite: print(('\tfeaturizing and training new ' + '{} model').format(learner_name), file=log_file) if not isinstance(cv_folds, int): grid_search_folds = cv_folds best_score = learner.train(train_examples, shuffle=shuffle, grid_search=grid_search, grid_search_folds=grid_search_folds, grid_objective=grid_objective, param_grid=param_grid, grid_jobs=grid_search_jobs) grid_scores = [best_score] # save model if model_path: learner.save(modelfile) if grid_search: # note: bankers' rounding is used in python 3, # so these scores may be different between runs in # python 2 and 3 at the final decimal place. 
print('\tbest {} grid search score: {}' .format(grid_objective, round(best_score, 3)), file=log_file) else: grid_scores = [None] # print out the tuned parameters and best CV score param_out = ('{}: {}'.format(param_name, param_value) for param_name, param_value in iteritems(learner.model.get_params())) print('\thyperparameters: {}'.format(', '.join(param_out)), file=log_file) # run on test set or cross-validate on training data, # depending on what was asked for if task == 'evaluate': print('\tevaluating predictions', file=log_file) task_results = [learner.evaluate( test_examples, prediction_prefix=prediction_prefix, grid_objective=grid_objective)] elif task == 'predict': print('\twriting predictions', file=log_file) learner.predict(test_examples, prediction_prefix=prediction_prefix) # do nothing here for train end_timestamp = datetime.datetime.now() learner_result_dict_base['end_timestamp'] = end_timestamp.strftime( '%d %b %Y %H:%M:%S.%f') total_time = end_timestamp - start_timestamp learner_result_dict_base['total_time'] = str(total_time) if task == 'cross_validate' or task == 'evaluate': results_json_path = join(results_path, '{}.results.json'.format(job_name)) res = _create_learner_result_dicts(task_results, grid_scores, learner_result_dict_base) # write out the result dictionary to a json file file_mode = 'w' if sys.version_info >= (3, 0) else 'wb' with open(results_json_path, file_mode) as json_file: json.dump(res, json_file, cls=NumpyTypeEncoder) with open(join(results_path, '{}.results'.format(job_name)), 'w') as output_file: _print_fancy_output(res, output_file) else: res = [learner_result_dict_base] # write out the cv folds if required if task == 'cross_validate' and save_cv_folds: skll_fold_ids_file = experiment_name + '_skll_fold_ids.csv' file_mode = 'w' if sys.version_info >= (3, 0) else 'wb' with open(join(results_path, skll_fold_ids_file), file_mode) as output_file: _write_skll_folds(skll_fold_ids, output_file) return res
def _classify_featureset(args): ''' Classification job to be submitted to grid ''' # Extract all the arguments. # (There doesn't seem to be a better way to do this since one can't specify # required keyword arguments.) experiment_name = args.pop("experiment_name") task = args.pop("task") job_name = args.pop("job_name") featureset = args.pop("featureset") learner_name = args.pop("learner_name") train_path = args.pop("train_path") test_path = args.pop("test_path") train_set_name = args.pop("train_set_name") test_set_name = args.pop("test_set_name") model_path = args.pop("model_path") prediction_prefix = args.pop("prediction_prefix") grid_search = args.pop("grid_search") grid_objective = args.pop("grid_objective") suffix = args.pop("suffix") log_path = args.pop("log_path") probability = args.pop("probability") results_path = args.pop("results_path") fixed_parameters = args.pop("fixed_parameters") param_grid = args.pop("param_grid") pos_label_str = args.pop("pos_label_str") overwrite = args.pop("overwrite") feature_scaling = args.pop("feature_scaling") min_feature_count = args.pop("min_feature_count") grid_search_jobs = args.pop("grid_search_jobs") cv_folds = args.pop("cv_folds") label_col = args.pop("label_col") ids_to_floats = args.pop("ids_to_floats") class_map = args.pop("class_map") quiet = args.pop('quiet', False) if args: raise ValueError(("Extra arguments passed to _classify_featureset: " + "{}").format(args.keys())) timestamp = datetime.datetime.now().strftime('%d %b %Y %H:%M:%S') with open(log_path, 'w') as log_file: # logging print("Task:", task, file=log_file) if task == 'cross_validate': print(("Cross-validating on {}, feature " + "set {} ...").format(train_set_name, featureset), file=log_file) elif task == 'evaluate': print(("Training on {}, Test on {}, " + "feature set {} ...").format(train_set_name, test_set_name, featureset), file=log_file) elif task == 'train': print("Training on {}, feature set {} ...".format(train_set_name, featureset), file=log_file) else: # predict print(("Training on {}, Making predictions about {}, " + "feature set {} ...").format(train_set_name, test_set_name, featureset), file=log_file) # check whether a trained model on the same data with the same # featureset already exists if so, load it and then use it on test data modelfile = os.path.join(model_path, '{}.model'.format(job_name)) # load the training and test examples if task == 'cross_validate' or (not os.path.exists(modelfile) or overwrite): train_examples = _load_featureset(train_path, featureset, suffix, label_col=label_col, ids_to_floats=ids_to_floats, quiet=quiet, class_map=class_map) # initialize a classifer object learner = Learner(learner_name, probability=probability, feature_scaling=feature_scaling, model_kwargs=fixed_parameters, pos_label_str=pos_label_str, min_feature_count=min_feature_count) # load the model if it already exists else: if os.path.exists(modelfile) and not overwrite: print(('\tloading pre-existing {} ' + 'model: {}').format(learner_name, modelfile)) learner = Learner.from_file(modelfile) # Load test set if there is one if task == 'evaluate' or task == 'predict': test_examples = _load_featureset(test_path, featureset, suffix, label_col=label_col, ids_to_floats=ids_to_floats, quiet=quiet, class_map=class_map, unlabelled=True) # create a list of dictionaries of the results information learner_result_dict_base = {'experiment_name': experiment_name, 'train_set_name': train_set_name, 'test_set_name': test_set_name, 'featureset': json.dumps(featureset), 'learner_name': 
learner_name, 'task': task, 'timestamp': timestamp, 'version': __version__, 'feature_scaling': feature_scaling, 'grid_search': grid_search, 'grid_objective': grid_objective, 'min_feature_count': min_feature_count} # check if we're doing cross-validation, because we only load/save # models when we're not. task_results = None if task == 'cross_validate': print('\tcross-validating', file=log_file) task_results, grid_scores = learner.cross_validate(train_examples, prediction_prefix=prediction_prefix, grid_search=grid_search, cv_folds=cv_folds, grid_objective=grid_objective, param_grid=param_grid, grid_jobs=grid_search_jobs) else: # if we have do not have a saved model, we need to train one. if not os.path.exists(modelfile) or overwrite: print(('\tfeaturizing and training new ' + '{} model').format(learner_name), file=log_file) grid_search_folds = 5 if not isinstance(cv_folds, int): grid_search_folds = cv_folds best_score = learner.train(train_examples, grid_search=grid_search, grid_search_folds=grid_search_folds, grid_objective=grid_objective, param_grid=param_grid, grid_jobs=grid_search_jobs) grid_scores = [best_score] # save model if model_path: learner.save(modelfile) if grid_search: print('\tbest {} grid search score: {}' .format(grid_objective, round(best_score, 3)), file=log_file) else: grid_scores = [None] # print out the tuned parameters and best CV score param_out = ('{}: {}'.format(param_name, param_value) for param_name, param_value in iteritems(learner.model.get_params())) print('\thyperparameters: {}'.format(', '.join(param_out)), file=log_file) # run on test set or cross-validate on training data, # depending on what was asked for if task == 'evaluate': print('\tevaluating predictions', file=log_file) task_results = [learner.evaluate( test_examples, prediction_prefix=prediction_prefix, grid_objective=grid_objective)] elif task == 'predict': print('\twriting predictions', file=log_file) learner.predict(test_examples, prediction_prefix=prediction_prefix) # do nothing here for train if task == 'cross_validate' or task == 'evaluate': results_json_path = os.path.join(results_path, '{}.results.json'.format(job_name)) res = _create_learner_result_dicts(task_results, grid_scores, learner_result_dict_base) # write out the result dictionary to a json file file_mode = 'w' if sys.version_info >= (3, 0) else 'wb' with open(results_json_path, file_mode) as json_file: json.dump(res, json_file) with open(os.path.join(results_path, '{}.results'.format(job_name)), 'w') as output_file: _print_fancy_output(res, output_file) else: res = [learner_result_dict_base] return res
def _classify_featureset(args): ''' Classification job to be submitted to grid ''' # Extract all the arguments. # (There doesn't seem to be a better way to do this since one can't specify # required keyword arguments.) experiment_name = args.pop("experiment_name") task = args.pop("task") job_name = args.pop("job_name") featureset = args.pop("featureset") learner_name = args.pop("learner_name") train_path = args.pop("train_path") test_path = args.pop("test_path") train_set_name = args.pop("train_set_name") test_set_name = args.pop("test_set_name") model_path = args.pop("model_path") prediction_prefix = args.pop("prediction_prefix") grid_search = args.pop("grid_search") grid_objective = args.pop("grid_objective") suffix = args.pop("suffix") log_path = args.pop("log_path") probability = args.pop("probability") results_path = args.pop("results_path") fixed_parameters = args.pop("fixed_parameters") param_grid = args.pop("param_grid") pos_label_str = args.pop("pos_label_str") overwrite = args.pop("overwrite") feature_scaling = args.pop("feature_scaling") min_feature_count = args.pop("min_feature_count") grid_search_jobs = args.pop("grid_search_jobs") cv_folds = args.pop("cv_folds") label_col = args.pop("label_col") ids_to_floats = args.pop("ids_to_floats") class_map = args.pop("class_map") quiet = args.pop('quiet', False) if args: raise ValueError(("Extra arguments passed to _classify_featureset: " + "{}").format(args.keys())) timestamp = datetime.datetime.now().strftime('%d %b %Y %H:%M:%S') with open(log_path, 'w') as log_file: # logging print("Task:", task, file=log_file) if task == 'cross_validate': print(("Cross-validating on {}, feature " + "set {} ...").format( train_set_name, featureset), file=log_file) elif task == 'evaluate': print( ("Training on {}, Test on {}, " + "feature set {} ...").format( train_set_name, test_set_name, featureset), file=log_file) elif task == 'train': print("Training on {}, feature set {} ...".format( train_set_name, featureset), file=log_file) else: # predict print(("Training on {}, Making predictions about {}, " + "feature set {} ...").format(train_set_name, test_set_name, featureset), file=log_file) # check whether a trained model on the same data with the same # featureset already exists if so, load it and then use it on test data modelfile = os.path.join(model_path, '{}.model'.format(job_name)) # load the training and test examples if task == 'cross_validate' or (not os.path.exists(modelfile) or overwrite): train_examples = _load_featureset(train_path, featureset, suffix, label_col=label_col, ids_to_floats=ids_to_floats, quiet=quiet, class_map=class_map) # initialize a classifer object learner = Learner(learner_name, probability=probability, feature_scaling=feature_scaling, model_kwargs=fixed_parameters, pos_label_str=pos_label_str, min_feature_count=min_feature_count) # load the model if it already exists else: if os.path.exists(modelfile) and not overwrite: print(('\tloading pre-existing {} ' + 'model: {}').format( learner_name, modelfile)) learner = Learner.from_file(modelfile) # Load test set if there is one if task == 'evaluate' or task == 'predict': test_examples = _load_featureset(test_path, featureset, suffix, label_col=label_col, ids_to_floats=ids_to_floats, quiet=quiet, class_map=class_map, unlabelled=True) # create a list of dictionaries of the results information learner_result_dict_base = { 'experiment_name': experiment_name, 'train_set_name': train_set_name, 'test_set_name': test_set_name, 'featureset': json.dumps(featureset), 'learner_name': 
learner_name, 'task': task, 'timestamp': timestamp, 'version': __version__, 'feature_scaling': feature_scaling, 'grid_search': grid_search, 'grid_objective': grid_objective, 'min_feature_count': min_feature_count } # check if we're doing cross-validation, because we only load/save # models when we're not. task_results = None if task == 'cross_validate': print('\tcross-validating', file=log_file) task_results, grid_scores = learner.cross_validate( train_examples, prediction_prefix=prediction_prefix, grid_search=grid_search, cv_folds=cv_folds, grid_objective=grid_objective, param_grid=param_grid, grid_jobs=grid_search_jobs) else: # if we have do not have a saved model, we need to train one. if not os.path.exists(modelfile) or overwrite: print(('\tfeaturizing and training new ' + '{} model').format(learner_name), file=log_file) grid_search_folds = 5 if not isinstance(cv_folds, int): grid_search_folds = cv_folds best_score = learner.train(train_examples, grid_search=grid_search, grid_search_folds=grid_search_folds, grid_objective=grid_objective, param_grid=param_grid, grid_jobs=grid_search_jobs) grid_scores = [best_score] # save model if model_path: learner.save(modelfile) if grid_search: # note: bankers' rounding is used in python 3, # so these scores may be different between runs in # python 2 and 3 at the final decimal place. print('\tbest {} grid search score: {}'.format( grid_objective, round(best_score, 3)), file=log_file) else: grid_scores = [None] # print out the tuned parameters and best CV score param_out = ('{}: {}'.format(param_name, param_value) for param_name, param_value in iteritems( learner.model.get_params())) print('\thyperparameters: {}'.format(', '.join(param_out)), file=log_file) # run on test set or cross-validate on training data, # depending on what was asked for if task == 'evaluate': print('\tevaluating predictions', file=log_file) task_results = [ learner.evaluate(test_examples, prediction_prefix=prediction_prefix, grid_objective=grid_objective) ] elif task == 'predict': print('\twriting predictions', file=log_file) learner.predict(test_examples, prediction_prefix=prediction_prefix) # do nothing here for train if task == 'cross_validate' or task == 'evaluate': results_json_path = os.path.join( results_path, '{}.results.json'.format(job_name)) res = _create_learner_result_dicts(task_results, grid_scores, learner_result_dict_base) # write out the result dictionary to a json file file_mode = 'w' if sys.version_info >= (3, 0) else 'wb' with open(results_json_path, file_mode) as json_file: json.dump(res, json_file) with open( os.path.join(results_path, '{}.results'.format(job_name)), 'w') as output_file: _print_fancy_output(res, output_file) else: res = [learner_result_dict_base] return res