def __init__(self, training_data, dev_data, data_info_file):
        """Loads data info, the training/dev data frames, and association stats.

        Args:
          training_data: path to the pipe-delimited training CSV.
          dev_data: path to the pipe-delimited development CSV.
          data_info_file: path to the data info mapping file.
        """

        def _nested_table():
            # Two-level default mapping whose innermost default is an empty
            # tuple; one table per kind of association statistic.
            return collections.defaultdict(
                lambda: collections.defaultdict(tuple))

        self._genus_values = _nested_table()
        self._family_values = _nested_table()
        self._neighborhood_values = _nested_table()
        self._implicational_values = _nested_table()

        logging.info("Reading data info from \"%s\" ...", data_info_file)
        self._data_info = data_lib.load_data_info(data_info_file)
        self._columns = []

        logging.info("Reading training_data from \"%s\" ...", training_data)
        self._training_df = pd.read_csv(
            training_data, delimiter="|", encoding=const.ENCODING)
        logging.info("Reading dev data from \"%s\" ...", dev_data)
        self._dev_df = pd.read_csv(
            dev_data, delimiter="|", encoding=const.ENCODING)
        self._per_feature_dfs = {}

        # Resolve the paths of the precomputed association statistics and load
        # them in one shot (genus, family, neighborhood, implicational order).
        association_paths = [
            _get_associations_path(FLAGS.genus_filename),
            _get_associations_path(FLAGS.family_filename),
            _get_associations_path(FLAGS.neighborhood_filename),
            _get_associations_path(FLAGS.implicational_filename),
        ]
        self._load_stats(*association_paths)
# Ejemplo n.º 2
# 0
    def init(self, training_data_dir, train_set_name, dev_set_name,
             features_to_predict):
        """Initializes the model."""
        # Load the training set and the data info mapping.
        self._df = basic_models.load_training_data(
            self._name, training_data_dir, train_set_name)
        info_path = data_info_lib.data_info_path_for_testing(training_data_dir)
        self._data_info = data_info_lib.load_data_info(info_path)

        # Every feature we are asked to predict must be known to the data info
        # mapping (though not necessarily all of its values).
        known_features = self._data_info[const.DATA_KEY_FEATURES]
        for feature_name in features_to_predict:
            if feature_name not in known_features:
                raise ValueError("Feature \"%s\" unseen in training data!" %
                                 feature_name)

        # Load the associations computed from the training data (family, genus,
        # neighborhood and implicationals). We are not using them yet.
        self._feature_maker = basic_models.make_feature_maker(
            self._name, training_data_dir, train_set_name, dev_set_name)

        # Compute majority class stats; only the global majority is kept.
        majority_stats = basic_models.collect_majority_stats(
            self._name, self._df)
        self._global_majority_class = majority_stats[0]

        # Read results of cross-validation, if a configurations file was given.
        if FLAGS.best_configurations_file:
            logging.info(
                "Reading cross-validation configuration from \"%s\" ...",
                FLAGS.best_configurations_file)
            with open(FLAGS.best_configurations_file,
                      "r",
                      encoding=const.ENCODING) as config_file:
                self._configs = json.load(config_file)
            logging.info("Read configurations for %d features.",
                         len(self._configs))

        # Prepare the evaluation data for all the features. This also prepares
        # the training data for the model-training step below.
        self._prepare_data(features_to_predict)

        # Train the models, using the training data cached by the previous
        # step.
        self._train_models(features_to_predict)
# Ejemplo n.º 3
# 0
def _load_data_info():
    """Loads data info mappings."""
    # File name pattern: <DATA_INFO_FILENAME>_<training set name><extension>.
    basename = "%s_%s%s" % (const.DATA_INFO_FILENAME,
                            FLAGS.training_set_name,
                            data_lib.FILE_EXTENSION)
    return data_lib.load_data_info(os.path.join(FLAGS.input_dir, basename))