def _cross_validation_training(feature_maker, feature_names):
  """Finds the best models for features by cross-validation.

  For every feature, trains candidate models twice — once with and once
  without implicational features — and keeps the configuration (model info
  plus the implicationals flag) that achieved the highest CV score.

  Args:
    feature_maker: (object) Feature builder used to prepare training data.
    feature_names: (list) List of WALS feature names (strings).

  Returns:
    A dict mapping feature name to a dict with two keys:
      "use_implicationals": (bool) whether implicationals helped, and
      "model": the best model info dict produced by
        `classifier_lib.select_best_model`.
  """
  use_implicationals_values = [True, False]
  best_configs = collections.defaultdict(
      lambda: collections.defaultdict(str))
  # Two configurations (with/without implicationals) per feature. This
  # excludes the models.
  total_num_configs = len(feature_names) * 2
  n = 1
  for feature_name in feature_names:
    best_models = []
    for use_implicationals in use_implicationals_values:
      logging.info("CV training: %d/%d", n, total_num_configs)
      # Process training and dev data for the feature.
      # NOTE: `prepare_data` returns a 6-tuple (see the other call sites in
      # this file); the previous 5-element unpack raised ValueError.
      X_train, y_train, _, _, _, _ = classifier_lib.prepare_data(
          feature_maker, feature_name,
          use_implicationals=use_implicationals)
      # Find the best model for the feature.
      best_model_info = classifier_lib.select_best_model(
          classifier_lib.ALL_MODELS, feature_name, X_train, y_train,
          FLAGS.cv_num_folds, FLAGS.cv_num_repeats)
      best_models.append((use_implicationals, best_model_info))
      n += 1

    # Select the best model with or without implicationals (highest score).
    winner = max(
        best_models,
        key=lambda info: info[1][classifier_lib.MODEL_INFO_SCORE_KEY])
    best_configs[feature_name]["use_implicationals"] = winner[0]
    best_configs[feature_name]["model"] = winner[1]
  return best_configs
def _prepare_data_worker(self, feature_name):
  """Prepares evaluation data for a single feature.

  Whether implicational features are included is decided by the
  cross-validation configuration for this feature, when one exists;
  otherwise implicationals are used by default.

  Args:
    feature_name: (string) Name of the WALS feature.

  Returns:
    A triple of the feature name (string), the eval (or test) input
    features for the classifier (numpy array) and the list of WALS
    language codes, one per row of the eval feature matrix.

  Raises:
    ValueError: If the number of eval rows and language codes disagree.
  """
  if feature_name in self._configs:
    use_implicationals = self._configs[feature_name]["use_implicationals"]
  else:
    use_implicationals = True

  _, _, eval_inputs, _, eval_language_codes, _ = classifier_lib.prepare_data(
      self._feature_maker, feature_name,
      use_implicationals=use_implicationals,
      prediction_mode=self._prediction_mode)

  # Each eval row must correspond to exactly one language code.
  num_examples = eval_inputs.shape[0]
  num_languages = len(eval_language_codes)
  if num_examples != num_languages:
    raise ValueError(
        "Number of eval examples (%d) mismatches number of "
        "languages (%d)!" % (num_examples, num_languages))
  return feature_name, eval_inputs, eval_language_codes
def _train_and_evaluate(feature_maker, feature_names):
  """Train and evaluate a particular feature.

  Please note: This mode is more suitable for proper evaluation rather
  than a lengthy cross-validation-based training.

  Args:
    feature_maker: (object) Feature builder.
    feature_names: (list) List of WALS feature names (strings).
  """
  for feature_name in feature_names:
    try:
      # Process training and dev data for the feature.
      X_train, y_train, X_dev, y_dev, _, _ = classifier_lib.prepare_data(
          feature_maker, feature_name,
          use_implicationals=FLAGS.use_implicationals)

      # Train each requested classifier and keep the one with the best
      # dev-set accuracy.
      best_acc = 0
      best_classifier = ""
      for classifier_name in FLAGS.classifiers:
        acc = _train_and_evaluate_model(feature_name, classifier_name,
                                        X_train, y_train, X_dev, y_dev)
        if acc > best_acc:
          best_acc = acc
          best_classifier = classifier_name
      print("=== [{}] {}: Dev set: Best Accuracy {}".format(
          feature_name, best_classifier, best_acc))
    except Exception:  # pylint: disable=broad-except
      # Deliberate best-effort mode: keep going over the remaining
      # features unless the user asked for failures to propagate.
      if not FLAGS.catch_exceptions:
        raise
      # `logging.exception` records the full traceback, so the cause of
      # the failure is not silently lost (warning() would drop it).
      logging.exception("Problem with processing feature: %s", feature_name)
def _train_model_worker(self, feature_name):
  """Trains an individual classifier in a single thread.

  The model type is taken from the best cross-validation configuration for
  the feature, unless `--force_classifier` overrides it. If the CV
  configuration marks the model as sparse or its score falls below
  `_BAD_ACCURACY_THRESHOLD`, no model is trained.

  Args:
    feature_name: (string) Name of the WALS feature.

  Returns:
    A triple of the feature name (string), the trained classifier and the
    frequency of the most frequent training class, or None when no
    reliable model was found for this feature.
  """
  model_is_reliable = True
  if not FLAGS.force_classifier:
    # Select classifiers from the best configuration.
    model_name = _DEFAULT_CLASSIFIER_NAME
    if feature_name in self._configs:
      assert "model" in self._configs[feature_name]
      model_config = self._configs[feature_name]["model"]
      model_name = model_config[classifier_lib.MODEL_INFO_NAME_KEY]
      should_ignore = model_config[classifier_lib.MODEL_INFO_SPARSITY_KEY]
      score = model_config[classifier_lib.MODEL_INFO_SCORE_KEY]
      if should_ignore or score < _BAD_ACCURACY_THRESHOLD:
        # Not enough training data or low CV accuracy score. Fall back to
        # search-based approaches.
        logging.warning("[%s] No reliable models found", feature_name)
        model_is_reliable = False
  else:
    # Use single classifier for everything.
    model_name = FLAGS.force_classifier

  if model_is_reliable:
    # Train the model. Please note, the training features have already been
    # constructed and cached by the feature maker during the data
    # preparation step preceding the training.
    logging.info("[%s] %s: \"%s\" ...", self._name, feature_name, model_name)
    use_implicationals = True
    # Guard the config lookup (mirrors `_prepare_data_worker`): a feature
    # absent from the CV configs previously caused a KeyError or a silent
    # falsy lookup instead of the intended default of True.
    if not FLAGS.force_classifier and feature_name in self._configs:
      use_implicationals = self._configs[feature_name][
          "use_implicationals"]
    X_train, y_train, _, _, _, train_class_counts = (
        classifier_lib.prepare_data(
            self._feature_maker, feature_name,
            use_implicationals=use_implicationals,
            prediction_mode=self._prediction_mode))
    # Highest class frequency — assumes `train_class_counts` is sorted by
    # descending frequency (TODO: confirm against classifier_lib).
    ymax_freq = train_class_counts[0][1]
    model = classifier_lib.train_classifier(feature_name, model_name,
                                            X_train, y_train)
    return feature_name, model, ymax_freq