Code example #1
def preprocess_for_prediction(
        model_path,
        split,
        data_csv=None,
        data_hdf5=None,
        train_set_metadata=None,
        evaluate_performance=True
):
    """Preprocesses the dataset to parse it into a format that is usable by the
    Ludwig core
        :param model_path: The input data that is joined with the model
               hyperparameter file to create the model definition file
        :type model_path: Str
        :param split: Splits the data into the train and test sets
        :param data_csv: The CSV input data file
        :param data_hdf5: The hdf5 data file if there is no csv data file
        :param train_set_metadata: Train set metadata for the input features
        :param evaluate_performance: If False does not load output features
        :returns: Dataset, Train set metadata
        """
    model_definition = load_json(
        os.path.join(model_path, MODEL_HYPERPARAMETERS_FILE_NAME)
    )
    for input_feature in model_definition['input_features']:
        if 'preprocessing' in input_feature:
            if 'in_memory' in input_feature['preprocessing']:
                if not input_feature['preprocessing']['in_memory']:
                    logger.warning(
                        'WARNING: When running predict in_memory flag should '
                        'be true. Overriding and setting it to true for '
                        'feature <{}>'.format(input_feature['name'])
                    )
                    input_feature['preprocessing']['in_memory'] = True
    preprocessing_params = merge_dict(
        default_preprocessing_parameters,
        model_definition['preprocessing']
    )
    output_features = model_definition[
        'output_features'] if evaluate_performance else []
    features = model_definition['input_features'] + output_features

    # Check if hdf5 file already exists
    if data_csv is not None:
        data_hdf5_fp = replace_file_extension(data_csv, 'hdf5')
        if os.path.isfile(data_hdf5_fp):
            logger.info('Found hdf5 with the same filename of the csv, '
                        'using it instead')
            data_csv = None
            data_hdf5 = data_hdf5_fp
    else:
        data_hdf5_fp = None

    # Load data
    train_set_metadata = load_metadata(train_set_metadata)
    if split == 'full':
        if data_hdf5 is not None:
            dataset = load_data(
                data_hdf5,
                model_definition['input_features'],
                output_features,
                split_data=False, shuffle_training=False
            )
        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata
            )
    else:
        if data_hdf5 is not None:
            training, test, validation = load_data(
                data_hdf5,
                model_definition['input_features'],
                output_features,
                shuffle_training=False
            )

            if split == 'training':
                dataset = training
            elif split == 'validation':
                dataset = validation
            else:  # if split == 'test':
                dataset = test
        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata
            )

    replace_text_feature_level(
        features,
        [dataset]
    )

    dataset = Dataset(
        dataset,
        model_definition['input_features'],
        output_features,
        data_hdf5_fp,
    )

    return dataset, train_set_metadata
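
A minimal usage sketch for this function. Everything below is a placeholder: the experiment directory layout, the metadata file name and the import path (the file headers further down suggest the function lives in Ludwig's preprocessing.py, but the exact module path and constants may differ by version).

# Hypothetical usage sketch; paths and import path are assumptions, adjust to
# your Ludwig version and experiment layout.
from ludwig.data.preprocessing import preprocess_for_prediction

dataset, train_set_metadata = preprocess_for_prediction(
    model_path='results/experiment_run/model',    # directory holding the hyperparameters json
    split='test',                                  # 'full', 'training', 'validation' or 'test'
    data_csv='new_data.csv',                       # a cached .hdf5 next to it is reused if present
    train_set_metadata='results/experiment_run/model/train_set_metadata.json',
    evaluate_performance=False                     # skip output features when there is no ground truth
)
# `dataset` is ready for the prediction loop; `train_set_metadata` holds the
# mappings that were used to encode it.
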
Code example #2
    def create_dataset(self, dataset, tag, config, training_set_metadata):
        return Dataset(dataset, get_proc_features(config),
                       training_set_metadata.get(DATA_TRAIN_HDF5_FP))
Code example #3
def preprocess_for_training_by_type(
        model_definition,
        data_type,
        all_data_fp=None,
        train_fp=None,
        validation_fp=None,
        test_fp=None,
        all_data_df=None,
        train_df=None,
        validation_df=None,
        test_df=None,
        train_set_metadata_json=None,
        skip_save_processed_input=False,
        preprocessing_params=default_preprocessing_parameters,
        random_seed=default_random_seed
):
    if all_data_fp is not None and train_fp is not None:
        raise ValueError('Use either one file for all data or 3 files for '
                         'train, test and validation')

    if data_type not in ['hdf5', 'csv', 'pandas']:
        raise ValueError('Invalid type of data provided')

    features = (model_definition['input_features'] +
                model_definition['output_features'])

    data_hdf5_fp = None

    if data_type == 'pandas':
        # Preprocess data frames
        (
            training_set,
            test_set,
            validation_set,
            train_set_metadata
        ) = _preprocess_df_for_training(
            features,
            all_data_df,
            train_df,
            validation_df,
            test_df,
            train_set_metadata_json=train_set_metadata_json,
            preprocessing_params=preprocessing_params,
            random_seed=random_seed
        )
    elif data_type == 'hdf5' and train_set_metadata_json is None:
        raise ValueError('train set metadata (json) must be provided '
                         'along with hdf5 data')
    elif data_type == 'hdf5':
        if all_data_fp is not None:
            data_hdf5_fp = replace_file_extension(all_data_fp, 'hdf5')
            logger.info('Using full hdf5 and json')
            training_set, test_set, validation_set = load_data(
                all_data_fp,
                model_definition['input_features'],
                model_definition['output_features'],
                shuffle_training=True
            )
            train_set_metadata = load_metadata(train_set_metadata_json)
        elif train_fp is not None:
            logger.info('Using hdf5 and json')
            training_set = load_data(
                train_fp,
                model_definition['input_features'],
                model_definition['output_features'],
                split_data=False
            )
            train_set_metadata = load_metadata(train_set_metadata_json)

            validation_set = None
            if validation_fp is not None:
                validation_set = load_data(
                    validation_fp,
                    model_definition['input_features'],
                    model_definition['output_features'],
                    split_data=False
                )

            test_set = None
            if test_fp is not None:
                test_set = load_data(
                    test_fp,
                    model_definition['input_features'],
                    model_definition['output_features'],
                    split_data=False
                )

    elif data_type == 'csv':
        # the hdf5 companion path can only be derived when a single csv
        # with all the data is provided
        if all_data_fp is not None:
            data_hdf5_fp = replace_file_extension(
                all_data_fp, 'hdf5'
            )
            model_definition['data_hdf5_fp'] = data_hdf5_fp

        if all_data_fp is not None:
            if (file_exists_with_diff_extension(all_data_fp, 'hdf5') and
                    file_exists_with_diff_extension(all_data_fp, 'json')):
                # use hdf5 data instead
                logger.info(
                    'Found hdf5 and json with the same filename '
                    'of the csv, using them instead'
                )
                return preprocess_for_training_by_type(
                    model_definition,
                    'hdf5',
                    all_data_fp=replace_file_extension(all_data_fp, 'hdf5'),
                    train_set_metadata_json=replace_file_extension(all_data_fp,
                                                                   'json'),
                    skip_save_processed_input=skip_save_processed_input,
                    preprocessing_params=preprocessing_params,
                    random_seed=random_seed
                )
            else:
                (
                    training_set,
                    test_set,
                    validation_set,
                    train_set_metadata
                ) = _preprocess_csv_for_training(
                    features=features,
                    data_csv=all_data_fp,
                    data_train_csv=None,
                    data_validation_csv=None,
                    data_test_csv=None,
                    train_set_metadata_json=train_set_metadata_json,
                    skip_save_processed_input=skip_save_processed_input,
                    preprocessing_params=preprocessing_params,
                    random_seed=random_seed
                )
        else:
            if (file_exists_with_diff_extension(train_fp, 'hdf5') and
                    file_exists_with_diff_extension(train_fp, 'json') and
                    file_exists_with_diff_extension(validation_fp, 'hdf5') and
                    file_exists_with_diff_extension(test_fp, 'hdf5')):
                logger.info(
                    'Found hdf5 and json with the same filename '
                    'of the csvs, using them instead.'
                )
                return preprocess_for_training_by_type(
                    model_definition,
                    'hdf5',
                    train_fp=replace_file_extension(train_fp, 'hdf5'),
                    validation_fp=replace_file_extension(validation_fp, 'hdf5'),
                    test_fp=replace_file_extension(test_fp, 'hdf5'),
                    train_set_metadata_json=replace_file_extension(train_fp,
                                                                   'json'),
                    skip_save_processed_input=skip_save_processed_input,
                    preprocessing_params=preprocessing_params,
                    random_seed=random_seed
                )
            else:
                (
                    training_set,
                    test_set,
                    validation_set,
                    train_set_metadata
                ) = _preprocess_csv_for_training(
                    features=features,
                    data_csv=None,
                    data_train_csv=train_fp,
                    data_validation_csv=validation_fp,
                    data_test_csv=test_fp,
                    train_set_metadata_json=train_set_metadata_json,
                    skip_save_processed_input=skip_save_processed_input,
                    preprocessing_params=preprocessing_params,
                    random_seed=random_seed
                )
    else:
        raise RuntimeError('Insufficient input parameters')

    replace_text_feature_level(
        model_definition['input_features'] +
        model_definition['output_features'],
        [training_set, validation_set, test_set]
    )

    training_dataset = Dataset(
        training_set,
        model_definition['input_features'],
        model_definition['output_features'],
        data_hdf5_fp
    )

    validation_dataset = None
    if validation_set is not None:
        validation_dataset = Dataset(
            validation_set,
            model_definition['input_features'],
            model_definition['output_features'],
            data_hdf5_fp
        )

    test_dataset = None
    if test_set is not None:
        test_dataset = Dataset(
            test_set,
            model_definition['input_features'],
            model_definition['output_features'],
            data_hdf5_fp
        )

    return (
        training_dataset,
        validation_dataset,
        test_dataset,
        train_set_metadata
    )
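
A minimal sketch of the in-memory path (data_type='pandas'), assuming the function above is in scope. The toy model definition and dataframe are illustrative assumptions; whether such a bare definition is accepted without further defaults depends on the Ludwig version, so treat this as a sketch rather than a guaranteed call.

import pandas as pd

# Toy definition and dataframe; illustrative only (see the note above).
model_definition = {
    'input_features': [{'name': 'doc', 'type': 'text'}],
    'output_features': [{'name': 'label', 'type': 'category'}],
}
df = pd.DataFrame({
    'doc': ['first datapoint', 'second datapoint', 'third datapoint'],
    'label': ['a', 'b', 'a'],
})

(training_dataset,
 validation_dataset,
 test_dataset,
 train_set_metadata) = preprocess_for_training_by_type(
    model_definition,
    'pandas',          # preprocess the dataframe in memory
    all_data_df=df,
)
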
Code example #4
    def train_online(
        self,
        data_df=None,
        data_csv=None,
        data_dict=None,
        batch_size=None,
        learning_rate=None,
        regularization_lambda=None,
        dropout_rate=None,
        bucketing_field=None,
        gpus=None,
        gpu_fraction=1,
        logging_level=logging.ERROR,
    ):
        """This function is used to perform one epoch of training of the model 
        on the specified dataset.

        # Inputs

        :param data_df: (DataFrame) dataframe containing data.
        :param data_csv: (string) input data CSV file.
        :param data_dict: (dict) input data dictionary. It is expected to 
               contain one key for each field and the values have to be lists of 
               the same length. Each index in the lists corresponds to one 
               datapoint. For example a data set consisting of two datapoints 
               with a text and a class may be provided as the following dict
               `{'text_field_name': ['text of the first datapoint', 'text of the
               second datapoint'], 'class_field_name': ['class_datapoints_1',
               'class_datapoints_2']}`.
        :param batch_size: (int) the batch size to use for training. By default 
               it's the one specified in the model definition.
        :param learning_rate: (float) the learning rate to use for training. By
               default the value is the one specified in the model definition.
        :param regularization_lambda: (float) the regularization lambda
               parameter to use for training. By default the value is the one
               specified in the model definition.
        :param dropout_rate: (float) the dropout rate to use for training. By
               default the value is the one specified in the model definition.
        :param bucketing_field: (string) the bucketing field to use for
               bucketing the data. By default the value is the one specified in
               the model definition.
        :param gpus: (string, default: `None`) list of GPUs to use (it uses the
               same syntax of CUDA_VISIBLE_DEVICES)
        :param gpu_fraction: (float, default `1.0`) fraction of GPU memory to
               initialize the process with
        :param logging_level: (int, default: `logging.ERROR`) logging level to
               use for logging. Use logging constants like `logging.DEBUG`,
               `logging.INFO` and `logging.ERROR`. By default only errors will
               be printed.

        There are three ways to provide data: by dataframes using the `data_df`
        parameter, by CSV using the `data_csv` parameter and by dictionary,
        using the `data_dict` parameter.

        The DataFrame approach uses data previously obtained and put in a
        dataframe, the CSV approach loads data from a CSV file, while the dict
        approach uses data organized by keys representing columns and values
        that are lists of the datapoints for each. For example a data set
        consisting of two datapoints with a text and a class may be provided as
        the following dict `{'text_field_name': ['text of the first datapoint',
        'text of the second datapoint'], 'class_field_name':
        ['class_datapoints_1', 'class_datapoints_2']}`.
        """
        logging.getLogger().setLevel(logging_level)
        if logging_level in {logging.WARNING, logging.ERROR, logging.CRITICAL}:
            set_disable_progressbar(True)

        if (self.model is None or self.model_definition is None
                or self.train_set_metadata is None):
            raise ValueError('Model has not been initialized or loaded')

        if data_df is None:
            data_df = self._read_data(data_csv, data_dict)
            data_df.csv = data_csv

        if batch_size is None:
            batch_size = self.model_definition['training']['batch_size']
        if learning_rate is None:
            learning_rate = self.model_definition['training']['learning_rate']
        if regularization_lambda is None:
            regularization_lambda = self.model_definition['training'][
                'regularization_lambda']
        if dropout_rate is None:
            dropout_rate = self.model_definition['training']['dropout_rate']
        if bucketing_field is None:
            bucketing_field = self.model_definition['training'][
                'bucketing_field']

        logging.debug('Preprocessing {} datapoints'.format(len(data_df)))
        features_to_load = (self.model_definition['input_features'] +
                            self.model_definition['output_features'])
        preprocessed_data = build_data(data_df, features_to_load,
                                       self.train_set_metadata,
                                       self.model_definition['preprocessing'])
        replace_text_feature_level(
            self.model_definition['input_features'] +
            self.model_definition['output_features'], [preprocessed_data])
        dataset = Dataset(preprocessed_data,
                          self.model_definition['input_features'],
                          self.model_definition['output_features'], None)

        logging.debug('Training batch')
        self.model.train_online(dataset,
                                batch_size=batch_size,
                                learning_rate=learning_rate,
                                regularization_lambda=regularization_lambda,
                                dropout_rate=dropout_rate,
                                bucketing_field=bucketing_field,
                                gpus=gpus,
                                gpu_fraction=gpu_fraction)
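
A minimal sketch of an online-training call based on the docstring above, assuming `model` is an already trained or initialized LudwigModel whose definition contains the two fields used below (the field names are the docstring's own placeholders).

# `model` is assumed to be a trained or initialized LudwigModel.
data_dict = {
    'text_field_name': ['text of the first datapoint',
                        'text of the second datapoint'],
    'class_field_name': ['class_datapoints_1', 'class_datapoints_2'],
}
model.train_online(
    data_dict=data_dict,
    batch_size=2,          # override just for this call
    learning_rate=0.0005,  # optional; falls back to the model definition
)
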
Code example #5
    def _predict(
        self,
        data_df=None,
        data_csv=None,
        data_dict=None,
        return_type=pd.DataFrame,
        batch_size=128,
        gpus=None,
        gpu_fraction=1,
        only_predictions=True,
        logging_level=logging.ERROR,
    ):
        logging.getLogger().setLevel(logging_level)
        if logging_level in {logging.WARNING, logging.ERROR, logging.CRITICAL}:
            set_disable_progressbar(True)

        if (self.model is None or self.model_definition is None
                or self.train_set_metadata is None):
            raise ValueError('Model has not been trained or loaded')

        if data_df is None:
            data_df = self._read_data(data_csv, data_dict)

        logging.debug('Preprocessing {} datapoints'.format(len(data_df)))
        # copy the list so that adding output features below does not
        # mutate the model definition's input feature list
        features_to_load = self.model_definition['input_features'][:]
        if not only_predictions:
            features_to_load += self.model_definition['output_features']
        preprocessed_data = build_data(data_df, features_to_load,
                                       self.train_set_metadata,
                                       self.model_definition['preprocessing'])
        replace_text_feature_level(
            self.model_definition['input_features'] +
            ([] if only_predictions else
             self.model_definition['output_features']), [preprocessed_data])
        dataset = Dataset(preprocessed_data,
                          self.model_definition['input_features'],
                          [] if only_predictions else
                          self.model_definition['output_features'], None)

        logging.debug('Predicting')
        predict_results = self.model.predict(dataset,
                                             batch_size,
                                             only_predictions=only_predictions,
                                             gpus=gpus,
                                             gpu_fraction=gpu_fraction,
                                             session=getattr(
                                                 self.model, 'session', None))

        if not only_predictions:
            calculate_overall_stats(predict_results,
                                    self.model_definition['output_features'],
                                    dataset, self.train_set_metadata)

        logging.debug('Postprocessing')
        if (return_type == 'dict' or return_type == 'dictionary'
                or return_type == dict):
            postprocessed_predictions = postprocess(
                predict_results, self.model_definition['output_features'],
                self.train_set_metadata)
        elif (return_type == 'dataframe' or return_type == 'df'
              or return_type == pd.DataFrame):
            postprocessed_predictions = postprocess_df(
                predict_results, self.model_definition['output_features'],
                self.train_set_metadata)
        else:
            logging.warning('Unrecognized return_type: {}. '
                            'Returning DataFrame.'.format(return_type))
            postprocessed_predictions = postprocess(
                predict_results, self.model_definition['output_features'],
                self.train_set_metadata)

        return postprocessed_predictions, predict_results
Code example #6
File: preprocessing.py Project: magiciiboy/ludwig
def preprocess_for_training(
        model_definition,
        dataset_type='generic',
        data_df=None,
        data_train_df=None,
        data_validation_df=None,
        data_test_df=None,
        data_csv=None,
        data_train_csv=None,
        data_validation_csv=None,
        data_test_csv=None,
        data_hdf5=None,
        data_train_hdf5=None,
        data_validation_hdf5=None,
        data_test_hdf5=None,
        train_set_metadata_json=None,
        skip_save_processed_input=False,
        preprocessing_params=default_preprocessing_parameters,
        random_seed=default_random_seed):
    # Sanity Check to make sure some data source is provided
    data_sources_provided = [
        data_df, data_train_df, data_csv, data_train_csv, data_hdf5,
        data_train_hdf5
    ]
    data_sources_not_none = [x is not None for x in data_sources_provided]
    if not any(data_sources_not_none):
        raise ValueError('No training data is provided!')

    # Check if hdf5 and json already exist. If they do, use the hdf5 data,
    # instead of the csvs
    data_hdf5_fp = None

    if data_csv is not None:
        data_hdf5_fp = replace_file_extension(data_csv, 'hdf5')
        train_set_metadata_json_fp = replace_file_extension(data_csv, 'json')
        if os.path.isfile(data_hdf5_fp) and os.path.isfile(
                train_set_metadata_json_fp):
            logging.info('Found hdf5 and json with the same filename '
                         'of the csv, using them instead')
            data_csv = None
            data_hdf5 = data_hdf5_fp
            train_set_metadata_json = train_set_metadata_json_fp

    if data_train_csv is not None:
        data_train_hdf5_fp = replace_file_extension(data_train_csv, 'hdf5')
        train_set_metadata_json_fp = replace_file_extension(
            data_train_csv,
            'json',
        )

        if os.path.isfile(data_train_hdf5_fp) and os.path.isfile(
                train_set_metadata_json_fp):
            logging.info('Found hdf5 and json with the same filename of '
                         'the train csv, using them instead')
            data_train_csv = None
            data_train_hdf5 = data_train_hdf5_fp
            train_set_metadata_json = train_set_metadata_json_fp

    if data_validation_csv is not None:
        data_validation_hdf5_fp = replace_file_extension(
            data_validation_csv, 'hdf5')
        if os.path.isfile(data_validation_hdf5_fp):
            logging.info('Found hdf5 with the same filename of '
                         'the validation csv, using it instead')
            data_validation_csv = None
            data_validation_hdf5 = data_validation_hdf5_fp

    if data_test_csv is not None:
        data_test_hdf5_fp = replace_file_extension(data_test_csv, 'hdf5')
        if os.path.isfile(data_test_hdf5_fp):
            logging.info('Found hdf5 with the same filename of '
                         'the test csv, using it instead')
            data_test_csv = None
            data_test_hdf5 = data_test_hdf5_fp

    model_definition['data_hdf5_fp'] = data_hdf5_fp

    # Decide whether to preprocess or just load
    features = (model_definition['input_features'] +
                model_definition['output_features'])
    (concatenate_csv, concatenate_df, build_dataset,
     build_dataset_df) = get_dataset_fun(dataset_type)

    if data_df is not None or data_train_df is not None:
        # Preprocess data frames
        (training_set, test_set, validation_set,
         train_set_metadata) = _preprocess_df_for_training(
             features, data_df, data_train_df, data_validation_df,
             data_test_df, preprocessing_params, random_seed)
    elif data_csv is not None or data_train_csv is not None:
        # Preprocess csv data
        (training_set, test_set, validation_set,
         train_set_metadata) = _preprocess_csv_for_training(
             features, data_csv, data_train_csv, data_validation_csv,
             data_test_csv, skip_save_processed_input, preprocessing_params,
             random_seed)

    elif data_hdf5 is not None and train_set_metadata_json is not None:
        # use data and train set metadata
        # does not need preprocessing, just load
        logging.info('Using full hdf5 and json')
        training_set, test_set, validation_set = load_data(
            data_hdf5,
            model_definition['input_features'],
            model_definition['output_features'],
            shuffle_training=True)
        train_set_metadata = load_metadata(train_set_metadata_json)

    elif data_train_hdf5 is not None and train_set_metadata_json is not None:
        # use data and train set metadata
        # does not need preprocessing, just load
        logging.info('Using hdf5 and json')
        training_set = load_data(data_train_hdf5,
                                 model_definition['input_features'],
                                 model_definition['output_features'],
                                 split_data=False)
        train_set_metadata = load_metadata(train_set_metadata_json)
        if data_validation_hdf5 is not None:
            validation_set = load_data(data_validation_hdf5,
                                       model_definition['input_features'],
                                       model_definition['output_features'],
                                       split_data=False)
        else:
            validation_set = None
        if data_test_hdf5 is not None:
            test_set = load_data(data_test_hdf5,
                                 model_definition['input_features'],
                                 model_definition['output_features'],
                                 split_data=False)
        else:
            test_set = None

    else:
        raise RuntimeError('Insufficient input parameters')

    replace_text_feature_level(
        model_definition['input_features'] +
        model_definition['output_features'],
        [training_set, validation_set, test_set])

    training_dataset = Dataset(training_set,
                               model_definition['input_features'],
                               model_definition['output_features'],
                               data_hdf5_fp)

    validation_dataset = None
    if validation_set is not None:
        validation_dataset = Dataset(validation_set,
                                     model_definition['input_features'],
                                     model_definition['output_features'],
                                     data_hdf5_fp)

    test_dataset = None
    if test_set is not None:
        test_dataset = Dataset(test_set, model_definition['input_features'],
                               model_definition['output_features'],
                               data_hdf5_fp)

    return (training_dataset, validation_dataset, test_dataset,
            train_set_metadata)
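
All of the csv branches above rely on the same convention: a preprocessed copy of data.csv is cached next to it as data.hdf5, together with a data.json holding the train set metadata, and the cached pair is preferred when both files exist. A standalone sketch of that check, using only the standard library (the helper name is hypothetical):

import os

def cached_companions(data_csv):
    """Return the (hdf5, json) companion paths of a csv if both already
    exist, mirroring the fallback logic above; otherwise return None."""
    base, _ = os.path.splitext(data_csv)
    hdf5_fp = base + '.hdf5'
    json_fp = base + '.json'
    if os.path.isfile(hdf5_fp) and os.path.isfile(json_fp):
        return hdf5_fp, json_fp
    return None

# e.g. cached_companions('train.csv') -> ('train.hdf5', 'train.json') when the
# cached files are present, None otherwise
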
Code example #7
File: api.py Project: smashpumpkin/ludwig
    def _predict(
        self,
        data_df=None,
        data_csv=None,
        data_dict=None,
        return_type=pd.DataFrame,
        batch_size=128,
        evaluate_performance=False,
        skip_save_unprocessed_output=False,
        gpus=None,
        gpu_fraction=1,
    ):

        if (self.model is None or self.model_definition is None
                or self.train_set_metadata is None):
            raise ValueError('Model has not been trained or loaded')

        if data_df is None:
            data_df = self._read_data(data_csv, data_dict)

        logger.debug('Preprocessing {} datapoints'.format(len(data_df)))
        # Use [:] to copy the list: assigning it directly would alias the
        # input feature list, and appending the output features below would
        # then mutate the model definition, which we definitely do not want
        features_to_load = self.model_definition['input_features'][:]
        if evaluate_performance:
            output_features = self.model_definition['output_features']
        else:
            output_features = []
        features_to_load += output_features

        num_overrides = override_in_memory_flag(
            self.model_definition['input_features'], True)
        if num_overrides > 0:
            logger.warning(
                'Using in_memory = False is not supported for Ludwig API.')

        preprocessed_data = build_data(data_df, features_to_load,
                                       self.train_set_metadata,
                                       self.model_definition['preprocessing'])
        replace_text_feature_level(features_to_load, [preprocessed_data])
        dataset = Dataset(preprocessed_data,
                          self.model_definition['input_features'],
                          output_features, None)

        logger.debug('Predicting')
        predict_results = self.model.predict(
            dataset,
            batch_size,
            evaluate_performance=evaluate_performance,
            gpus=gpus,
            gpu_fraction=gpu_fraction,
            session=getattr(self.model, 'session', None))

        if evaluate_performance:
            calculate_overall_stats(predict_results,
                                    self.model_definition['output_features'],
                                    dataset, self.train_set_metadata)

        logger.debug('Postprocessing')
        if (return_type == 'dict' or return_type == 'dictionary'
                or return_type == dict):
            postprocessed_predictions = postprocess(
                predict_results,
                self.model_definition['output_features'],
                self.train_set_metadata,
                experiment_dir_name=self.exp_dir_name,
                skip_save_unprocessed_output=skip_save_unprocessed_output,
            )
        elif (return_type == 'dataframe' or return_type == 'df'
              or return_type == pd.DataFrame):
            postprocessed_predictions = postprocess_df(
                predict_results,
                self.model_definition['output_features'],
                self.train_set_metadata,
                experiment_dir_name=self.exp_dir_name,
                skip_save_unprocessed_output=skip_save_unprocessed_output,
            )
        else:
            logger.warning('Unrecognized return_type: {}. '
                           'Returning DataFrame.'.format(return_type))
            postprocessed_predictions = postprocess(
                predict_results,
                self.model_definition['output_features'],
                self.train_set_metadata,
                experiment_dir_name=self.exp_dir_name,
                skip_save_unprocessed_output=skip_save_unprocessed_output,
            )

        return postprocessed_predictions, predict_results
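
The [:] copy mentioned in the comment above matters because assigning a list only creates another reference to it; without the copy, appending the output features would grow the input_features list inside the model definition itself. A minimal, self-contained illustration:

# Aliasing vs copying a list pulled out of the model definition.
model_definition = {'input_features': [{'name': 'a'}],
                    'output_features': [{'name': 'b'}]}

aliased = model_definition['input_features']      # same list object
aliased += model_definition['output_features']
print(len(model_definition['input_features']))    # 2: the definition was mutated

model_definition = {'input_features': [{'name': 'a'}],
                    'output_features': [{'name': 'b'}]}
copied = model_definition['input_features'][:]    # shallow copy
copied += model_definition['output_features']
print(len(model_definition['input_features']))    # 1: the definition is untouched
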
Code example #8
File: preprocessing.py Project: sree181/ludwig
def preprocess_for_prediction(
        model_path,
        split,
        data_csv=None,
        data_hdf5=None,
        train_set_metadata=None,
        evaluate_performance=True
):
    """Preprocesses the dataset to parse it into a format that is usable by the
    Ludwig core
        :param model_path: The input data that is joined with the model
               hyperparameter file to create the model definition file
        :type model_path: Str
        :param split: Splits the data into the train and test sets
        :param data_csv: The CSV input data file
        :param data_hdf5: The hdf5 data file if there is no csv data file
        :param train_set_metadata: Train set metadata for the input features
        :param evaluate_performance: If False does not load output features
        :returns: Dataset, Train set metadata
        """
    model_definition = load_json(
        os.path.join(model_path, MODEL_HYPERPARAMETERS_FILE_NAME)
    )
    for input_feature in model_definition['input_features']:
        if 'preprocessing' in input_feature:
            if 'in_memory' in input_feature['preprocessing']:
                if not input_feature['preprocessing']['in_memory']:
                    logger.warning(
                        'WARNING: When running predict in_memory flag should '
                        'be true. Overriding and setting it to true for '
                        'feature <{}>'.format(input_feature['name'])
                    )
                    input_feature['preprocessing']['in_memory'] = True
    preprocessing_params = merge_dict(
        default_preprocessing_parameters,
        model_definition['preprocessing']
    )
    output_features = model_definition[
        'output_features'] if evaluate_performance else []
    features = model_definition['input_features'] + output_features

    # Check if hdf5 file already exists
    if data_csv is not None:
        data_hdf5_fp = replace_file_extension(data_csv, 'hdf5')
        if os.path.isfile(data_hdf5_fp):
            logger.info('Found hdf5 with the same filename of the csv, '
                        'using it instead')
            data_csv = None
            data_hdf5 = data_hdf5_fp
    else:
        data_hdf5_fp = None

    # Load data
    train_set_metadata = load_metadata(train_set_metadata)
    if split == FULL:
        if data_hdf5 is not None:
            dataset = load_data(
                data_hdf5,
                model_definition['input_features'],
                output_features,
                split_data=False, shuffle_training=False
            )
        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata
            )
    else:
        if data_hdf5 is not None:
            training_set, test_set, validation_set = load_data(
                data_hdf5,
                model_definition['input_features'],
                output_features,
                shuffle_training=False
            )

            if split == TRAINING:
                dataset = training_set
            elif split == VALIDATION:
                dataset = validation_set
            else:  # if split == TEST:
                dataset = test_set

        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata
            )
            # build_dataset adds a split column if there is none in the csv
            # so if we want to check if the csv contained a split column
            # we have to check in the csv not in the built dataset.
            # The logic is that if there is no split in the original csv
            # we treat the split parameter as if it was == full
            if csv_contains_column(data_csv, SPLIT):
                training_set, test_set, validation_set = split_dataset_tvt(
                    dataset,
                    dataset[SPLIT]
                )
                if split == TRAINING:
                    dataset = training_set
                elif split == VALIDATION:
                    dataset = validation_set
                else:  # if split == TEST:
                    dataset = test_set
            else:
                logger.warning(
                    'You requested the {} split, but the data CSV '
                    'does not contain a "split" column, so the '
                    'full data will be used instead'.format(split)
                )

    replace_text_feature_level(
        features,
        [dataset]
    )

    dataset = Dataset(
        dataset,
        model_definition['input_features'],
        output_features,
        train_set_metadata.get(DATA_TRAIN_HDF5_FP)
    )

    return dataset, train_set_metadata
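
The non-full branch above assumes the csv may carry its own split column; when it does, split_dataset_tvt partitions the rows by that column's value. A small pandas sketch of the same idea, assuming the common 0/1/2 encoding for training/validation/test (check the constants in your Ludwig version before relying on it):

import pandas as pd

# Sketch only: the 0/1/2 encoding is an assumption stated in the lead-in.
df = pd.DataFrame({
    'text': ['a', 'b', 'c', 'd'],
    'split': [0, 0, 1, 2],
})
training_set = df[df['split'] == 0]
validation_set = df[df['split'] == 1]
test_set = df[df['split'] == 2]
print(len(training_set), len(validation_set), len(test_set))  # 2 1 1
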
Code example #9
def preprocess_for_prediction(model_path,
                              split,
                              dataset_type='generic',
                              data_csv=None,
                              data_hdf5=None,
                              train_set_metadata=None,
                              only_predictions=False):
    """Preprocesses the dataset to parse it into a format that is usable by the
    Ludwig core
        :param model_path: The input data that is joined with the model
               hyperparameter file to create the model definition file
        :type model_path: Str
        :param dataset_type: Generic
        :type: Str
        :param split: Splits the data into the train and test sets
        :param data_csv: The CSV input data file
        :param data_hdf5: The hdf5 data file if there is no csv data file
        :param train_set_metadata: Train set metadata for the input features
        :param only_predictions: If False does not load output features
        :returns: Dataset, Train set metadata
        """
    model_definition = load_json(
        os.path.join(model_path, MODEL_HYPERPARAMETERS_FILE_NAME))
    preprocessing_params = merge_dict(default_preprocessing_parameters,
                                      model_definition['preprocessing'])

    # Check if hdf5 and json already exist
    if data_csv is not None:
        data_hdf5_fp = os.path.splitext(data_csv)[0] + '.hdf5'
        if os.path.isfile(data_hdf5_fp):
            logging.info(
                'Found hdf5 with the same filename of the csv, using it instead'
            )
            data_csv = None
            data_hdf5 = data_hdf5_fp

    # Load data
    _, _, build_dataset, _ = get_dataset_fun(dataset_type)
    train_set_metadata = load_metadata(train_set_metadata)
    features = (
        model_definition['input_features'] +
        ([] if only_predictions else model_definition['output_features']))
    if split == 'full':
        if data_hdf5 is not None:
            dataset = load_data(data_hdf5,
                                model_definition['input_features'],
                                [] if only_predictions else
                                model_definition['output_features'],
                                split_data=False,
                                shuffle_training=False)
        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata)
    else:
        if data_hdf5 is not None:
            training, test, validation = load_data(
                data_hdf5,
                model_definition['input_features'], []
                if only_predictions else model_definition['output_features'],
                shuffle_training=False)

            if split == 'training':
                dataset = training
            elif split == 'validation':
                dataset = validation
            else:  # if split == 'test':
                dataset = test
        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata)

    replace_text_feature_level(model_definition, [dataset])

    dataset = Dataset(
        dataset,
        model_definition['input_features'],
        [] if only_predictions else model_definition['output_features'],
        data_hdf5,
    )

    return dataset, train_set_metadata
Code example #10
def preprocess_for_training(
        model_definition,
        dataset_type='generic',
        data_df=None,
        data_train_df=None,
        data_validation_df=None,
        data_test_df=None,
        data_csv=None,
        data_train_csv=None,
        data_validation_csv=None,
        data_test_csv=None,
        data_hdf5=None,
        data_train_hdf5=None,
        data_validation_hdf5=None,
        data_test_hdf5=None,
        train_set_metadata_json=None,
        skip_save_processed_input=False,
        preprocessing_params=default_preprocessing_parameters,
        random_seed=default_random_seed):
    # Check if hdf5 and json already exist
    data_hdf5_fp = None
    data_train_hdf5_fp = None
    data_validation_hdf5_fp = None
    data_test_hdf5_fp = None
    train_set_metadata_json_fp = 'metadata.json'
    if data_csv is not None:
        data_hdf5_fp = os.path.splitext(data_csv)[0] + '.hdf5'
        train_set_metadata_json_fp = os.path.splitext(data_csv)[0] + '.json'
        if (os.path.isfile(data_hdf5_fp)
                and os.path.isfile(train_set_metadata_json_fp)):
            logging.info('Found hdf5 and json with the same filename '
                         'of the csv, using them instead')
            data_csv = None
            data_hdf5 = data_hdf5_fp
            train_set_metadata_json = train_set_metadata_json_fp

    if data_train_csv is not None:
        data_train_hdf5_fp = os.path.splitext(data_train_csv)[0] + '.hdf5'
        train_set_metadata_json_fp = os.path.splitext(
            data_train_csv)[0] + '.json'
        if (os.path.isfile(data_train_hdf5_fp)
                and os.path.isfile(train_set_metadata_json_fp)):
            logging.info('Found hdf5 and json with the same filename of '
                         'the train csv, using them instead')
            data_train_csv = None
            data_train_hdf5 = data_train_hdf5_fp
            train_set_metadata_json = train_set_metadata_json_fp

    if data_validation_csv is not None:
        data_validation_hdf5_fp = os.path.splitext(
            data_validation_csv)[0] + '.hdf5'
        if os.path.isfile(data_validation_hdf5_fp):
            logging.info('Found hdf5 with the same filename of '
                         'the validation csv, using it instead')
            data_validation_csv = None
            data_validation_hdf5 = data_validation_hdf5_fp

    if data_test_csv is not None:
        data_test_hdf5_fp = os.path.splitext(data_test_csv)[0] + '.hdf5'
        if os.path.isfile(data_test_hdf5_fp):
            logging.info('Found hdf5 with the same filename of '
                         'the test csv, using it instead')
            data_test_csv = None
            data_test_hdf5 = data_test_hdf5_fp

    model_definition['data_hdf5_fp'] = data_hdf5_fp

    # Decide whether to preprocess or just load
    features = (model_definition['input_features'] +
                model_definition['output_features'])
    (concatenate_csv, concatenate_df, build_dataset,
     build_dataset_df) = get_dataset_fun(dataset_type)

    if data_df is not None:
        # needs preprocessing
        logging.info('Using full dataframe')
        logging.info('Building dataset (it may take a while)')
        data, train_set_metadata = build_dataset_df(data_df,
                                                    features,
                                                    preprocessing_params,
                                                    random_seed=random_seed)
        if not skip_save_processed_input:
            logging.info('Writing dataset')
            data_utils.save_hdf5(data_hdf5_fp, data, train_set_metadata)
        logging.info('Writing train set metadata with vocabulary')
        data_utils.save_json(train_set_metadata_json_fp, train_set_metadata)
        training_set, test_set, validation_set = split_dataset_tvt(
            data, data['split'])

    elif data_train_df is not None:
        # needs preprocessing
        logging.info('Using training dataframe')
        logging.info('Building dataset (it may take a while)')
        concatenated_df = concatenate_df(data_train_df, data_validation_df,
                                         data_test_df)
        data, train_set_metadata = build_dataset_df(concatenated_df,
                                                    features,
                                                    preprocessing_params,
                                                    random_seed=random_seed)
        training_set, test_set, validation_set = split_dataset_tvt(
            data, data['split'])
        if not skip_save_processed_input:
            logging.info('Writing dataset')
            data_utils.save_hdf5(data_train_hdf5_fp, training_set,
                                 train_set_metadata)
            if validation_set is not None:
                data_utils.save_hdf5(data_validation_hdf5_fp, validation_set,
                                     train_set_metadata)
            if test_set is not None:
                data_utils.save_hdf5(data_test_hdf5_fp, test_set,
                                     train_set_metadata)
        logging.info('Writing train set metadata with vocabulary')
        data_utils.save_json(train_set_metadata_json_fp, train_set_metadata)

    elif data_csv is not None:
        # use data_csv and ignore the _train, _validation and _test csvs;
        # also ignore hdf5 data and train set metadata; needs preprocessing
        logging.info('Using full raw csv, no hdf5 and json files '
                     'with the same name have been found')
        logging.info('Building dataset (it may take a while)')
        data, train_set_metadata = build_dataset(data_csv,
                                                 features,
                                                 preprocessing_params,
                                                 random_seed=random_seed)
        if not skip_save_processed_input:
            logging.info('Writing dataset')
            data_utils.save_hdf5(data_hdf5_fp, data, train_set_metadata)
            logging.info('Writing train set metadata with vocabulary')
            data_utils.save_json(train_set_metadata_json_fp,
                                 train_set_metadata)
        training_set, test_set, validation_set = split_dataset_tvt(
            data, data['split'])

    elif data_train_csv is not None:
        # use data_train (including _validation and _test if they are present)
        # and ignore data and train set metadata
        # needs preprocessing
        logging.info('Using training raw csv, no hdf5 and json '
                     'files with the same name have been found')
        logging.info('Building dataset (it may take a while)')
        concatenated_df = concatenate_csv(data_train_csv, data_validation_csv,
                                          data_test_csv)
        concatenated_df.csv = data_train_csv
        data, train_set_metadata = build_dataset_df(concatenated_df,
                                                    features,
                                                    preprocessing_params,
                                                    random_seed=random_seed)
        training_set, test_set, validation_set = split_dataset_tvt(
            data, data['split'])
        if not skip_save_processed_input:
            logging.info('Writing dataset')
            data_utils.save_hdf5(data_train_hdf5_fp, training_set,
                                 train_set_metadata)
            if validation_set is not None:
                data_utils.save_hdf5(data_validation_hdf5_fp, validation_set,
                                     train_set_metadata)
            if test_set is not None:
                data_utils.save_hdf5(data_test_hdf5_fp, test_set,
                                     train_set_metadata)
            logging.info('Writing train set metadata with vocabulary')
            data_utils.save_json(train_set_metadata_json_fp,
                                 train_set_metadata)

    elif data_hdf5 is not None and train_set_metadata_json is not None:
        # use data and train set metadata
        # doesn't need preprocessing, just load
        logging.info('Using full hdf5 and json')
        training_set, test_set, validation_set = load_data(
            data_hdf5,
            model_definition['input_features'],
            model_definition['output_features'],
            shuffle_training=True)
        train_set_metadata = load_metadata(train_set_metadata_json)

    elif data_train_hdf5 is not None and train_set_metadata_json is not None:
        # use data and train set metadata
        # doesn't need preprocessing, just load
        logging.info('Using hdf5 and json')
        training_set = load_data(data_train_hdf5,
                                 model_definition['input_features'],
                                 model_definition['output_features'],
                                 split_data=False)
        train_set_metadata = load_metadata(train_set_metadata_json)
        if data_validation_hdf5 is not None:
            validation_set = load_data(data_validation_hdf5,
                                       model_definition['input_features'],
                                       model_definition['output_features'],
                                       split_data=False)
        else:
            validation_set = None
        if data_test_hdf5 is not None:
            test_set = load_data(data_test_hdf5,
                                 model_definition['input_features'],
                                 model_definition['output_features'],
                                 split_data=False)
        else:
            test_set = None

    else:
        raise RuntimeError('Insufficient input parameters')

    replace_text_feature_level(model_definition,
                               [training_set, validation_set, test_set])

    training_dataset = Dataset(training_set,
                               model_definition['input_features'],
                               model_definition['output_features'],
                               data_hdf5_fp)

    validation_dataset = None
    if validation_set is not None:
        validation_dataset = Dataset(validation_set,
                                     model_definition['input_features'],
                                     model_definition['output_features'],
                                     data_hdf5_fp)

    test_dataset = None
    if test_set is not None:
        test_dataset = Dataset(test_set, model_definition['input_features'],
                               model_definition['output_features'],
                               data_hdf5_fp)

    return (training_dataset, validation_dataset, test_dataset,
            train_set_metadata)
Code example #11
File: api.py Project: wbeater/ludwig
    def train_online(self,
                     data_df=None,
                     data_csv=None,
                     data_dict=None,
                     batch_size=None,
                     learning_rate=None,
                     regularization_lambda=None,
                     bucketing_field=None):
        """This function is used to perform one epoch of training of the model
        on the specified dataset.

        # Inputs

        :param data_df: (DataFrame) dataframe containing data.
        :param data_csv: (string) input data CSV file.
        :param data_dict: (dict) input data dictionary. It is expected to
               contain one key for each field and the values have to be lists of
               the same length. Each index in the lists corresponds to one
               datapoint. For example a data set consisting of two datapoints
               with a text and a class may be provided as the following dict
               `{'text_field_name': ['text of the first datapoint', 'text of the
               second datapoint'], 'class_field_name': ['class_datapoints_1',
               'class_datapoints_2']}`.
        :param batch_size: (int) the batch size to use for training. By default
               it's the one specified in the model definition.
        :param learning_rate: (float) the learning rate to use for training. By
               default the value is the one specified in the model definition.
        :param regularization_lambda: (float) the regularization lambda
               parameter to use for training. By default the value is the one
               specified in the model definition.
        :param bucketing_field: (string) the bucketing field to use for
               bucketing the data. By default the value is the one specified in
               the model definition.

        There are three ways to provide data: by dataframes using the `data_df`
        parameter, by CSV using the `data_csv` parameter and by dictionary,
        using the `data_dict` parameter.

        The DataFrame approach uses data previously obtained and put in a
        dataframe, the CSV approach loads data from a CSV file, while the dict
        approach uses data organized by keys representing columns and values
        that are lists of the datapoints for each. For example a data set
        consisting of two datapoints with a text and a class may be provided as
        the following dict `{'text_field_name': ['text of the first datapoint',
        'text of the second datapoint'], 'class_field_name':
        ['class_datapoints_1', 'class_datapoints_2']}`.
        """

        if (self.model is None or self.model_definition is None
                or self.train_set_metadata is None):
            raise ValueError('Model has not been initialized or loaded')

        if data_df is None:
            data_df = self._read_data(data_csv, data_dict)
            data_df.csv = data_csv

        if batch_size is None:
            batch_size = self.model_definition[TRAINING]['batch_size']
        if learning_rate is None:
            learning_rate = self.model_definition[TRAINING]['learning_rate']
        if regularization_lambda is None:
            regularization_lambda = self.model_definition[TRAINING][
                'regularization_lambda']
        if bucketing_field is None:
            bucketing_field = self.model_definition[TRAINING][
                'bucketing_field']

        logger.debug('Preprocessing {} datapoints'.format(len(data_df)))
        features_to_load = (self.model_definition['input_features'] +
                            self.model_definition['output_features'])
        preprocessed_data = build_data(data_df, features_to_load,
                                       self.train_set_metadata,
                                       self.model_definition['preprocessing'])
        replace_text_feature_level(
            self.model_definition['input_features'] +
            self.model_definition['output_features'], [preprocessed_data])
        dataset = Dataset(preprocessed_data,
                          self.model_definition['input_features'],
                          self.model_definition['output_features'], None)

        logger.debug('Training batch')
        self.model.train_online(dataset,
                                batch_size=batch_size,
                                learning_rate=learning_rate,
                                regularization_lambda=regularization_lambda,
                                bucketing_field=bucketing_field)
Code example #12
    def _predict(
        self,
        data_df=None,
        data_csv=None,
        data_dict=None,
        return_type=pd.DataFrame,
        batch_size=128,
        gpus=None,
        gpu_fraction=1,
        evaluate_performance=False,
        logging_level=logging.ERROR,
    ):
        logging.getLogger('ludwig').setLevel(logging_level)
        if logging_level in {logging.WARNING, logging.ERROR, logging.CRITICAL}:
            set_disable_progressbar(True)

        if (self.model is None or self.model_definition is None
                or self.train_set_metadata is None):
            raise ValueError('Model has not been trained or loaded')

        if data_df is None:
            data_df = self._read_data(data_csv, data_dict)

        logger.debug('Preprocessing {} datapoints'.format(len(data_df)))
        # Use [:] to copy the list: assigning it directly would alias the
        # input feature list, and appending the output features below would
        # then mutate the model definition, which we definitely do not want
        features_to_load = self.model_definition['input_features'][:]
        if evaluate_performance:
            output_features = self.model_definition['output_features']
        else:
            output_features = []
        features_to_load += output_features

        preprocessed_data = build_data(data_df, features_to_load,
                                       self.train_set_metadata,
                                       self.model_definition['preprocessing'])
        replace_text_feature_level(features_to_load, [preprocessed_data])
        dataset = Dataset(preprocessed_data,
                          self.model_definition['input_features'],
                          output_features, None)

        logger.debug('Predicting')
        predict_results = self.model.predict(
            dataset,
            batch_size,
            evaluate_performance=evaluate_performance,
            gpus=gpus,
            gpu_fraction=gpu_fraction,
            session=getattr(self.model, 'session', None))

        if evaluate_performance:
            calculate_overall_stats(predict_results,
                                    self.model_definition['output_features'],
                                    dataset, self.train_set_metadata)

        logger.debug('Postprocessing')
        if (return_type == 'dict' or return_type == 'dictionary'
                or return_type == dict):
            postprocessed_predictions = postprocess(
                predict_results, self.model_definition['output_features'],
                self.train_set_metadata)
        elif (return_type == 'dataframe' or return_type == 'df'
              or return_type == pd.DataFrame):
            postprocessed_predictions = postprocess_df(
                predict_results, self.model_definition['output_features'],
                self.train_set_metadata)
        else:
            logger.warning('Unrecognized return_type: {}. '
                           'Returning DataFrame.'.format(return_type))
            postprocessed_predictions = postprocess(
                predict_results, self.model_definition['output_features'],
                self.train_set_metadata)

        return postprocessed_predictions, predict_results
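
As the dispatch above shows, return_type is compared both against strings and against the actual Python types, so callers may pass either form. A standalone sketch of the same pattern:

import pandas as pd

# Normalizes the return_type argument the same way _predict does above:
# unrecognized values fall back to a DataFrame.
def normalize_return_type(return_type):
    if return_type in ('dict', 'dictionary', dict):
        return dict
    if return_type in ('dataframe', 'df', pd.DataFrame):
        return pd.DataFrame
    return pd.DataFrame

assert normalize_return_type('df') is pd.DataFrame
assert normalize_return_type(dict) is dict
assert normalize_return_type('something else') is pd.DataFrame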