Example #1
def delete_temporary_data(csv_path):
    """
    Helper method to delete temporary data created for running tests. Deletes
    the csv and hdf5/json data (if any)
    :param csv_path: path to the csv data file
    :return: None
    """
    if os.path.isfile(csv_path):
        os.remove(csv_path)

    json_path = replace_file_extension(csv_path, 'meta.json')
    if os.path.isfile(json_path):
        os.remove(json_path)

    hdf5_path = replace_file_extension(csv_path, 'hdf5')
    if os.path.isfile(hdf5_path):
        os.remove(hdf5_path)
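
A minimal usage sketch for the helper above; the csv path is hypothetical.

# Hypothetical teardown usage: remove the csv plus any derived meta.json / hdf5
# files that preprocessing may have left next to it.
csv_path = '/tmp/generated_test_data.csv'   # hypothetical path
delete_temporary_data(csv_path)
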
Example #2
def preprocess_for_prediction(
        model_path,
        split,
        data_csv=None,
        data_hdf5=None,
        train_set_metadata=None,
        evaluate_performance=True
):
    """Preprocesses the dataset to parse it into a format that is usable by the
    Ludwig core
        :param model_path: Path to the trained model directory; the model
               hyperparameters file inside it is loaded to obtain the model
               definition
        :type model_path: str
        :param split: Which split of the data to use
               ('full', 'training', 'validation' or 'test')
        :param data_csv: The CSV input data file
        :param data_hdf5: The hdf5 data file if there is no csv data file
        :param train_set_metadata: Train set metadata for the input features
        :param evaluate_performance: If False, does not load output features
        :returns: Dataset, train set metadata
        """
    model_definition = load_json(
        os.path.join(model_path, MODEL_HYPERPARAMETERS_FILE_NAME)
    )
    for input_feature in model_definition['input_features']:
        if 'preprocessing' in input_feature:
            if 'in_memory' in input_feature['preprocessing']:
                if not input_feature['preprocessing']['in_memory']:
                    logger.warning(
                        'WARNING: When running predict in_memory flag should '
                        'be true. Overriding and setting it to true for '
                        'feature <{}>'.format(input_feature['name'])
                    )
                    input_feature['preprocessing']['in_memory'] = True
    preprocessing_params = merge_dict(
        default_preprocessing_parameters,
        model_definition['preprocessing']
    )
    output_features = model_definition[
        'output_features'] if evaluate_performance else []
    features = model_definition['input_features'] + output_features

    # Check if hdf5 file already exists
    if data_csv is not None:
        data_hdf5_fp = replace_file_extension(data_csv, 'hdf5')
        if os.path.isfile(data_hdf5_fp):
            logger.info('Found hdf5 with the same filename of the csv, '
                        'using it instead')
            data_csv = None
            data_hdf5 = data_hdf5_fp
    else:
        data_hdf5_fp = None

    # Load data
    train_set_metadata = load_metadata(train_set_metadata)
    if split == 'full':
        if data_hdf5 is not None:
            dataset = load_data(
                data_hdf5,
                model_definition['input_features'],
                output_features,
                split_data=False, shuffle_training=False
            )
        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata
            )
    else:
        if data_hdf5 is not None:
            training, test, validation = load_data(
                data_hdf5,
                model_definition['input_features'],
                output_features,
                shuffle_training=False
            )

            if split == 'training':
                dataset = training
            elif split == 'validation':
                dataset = validation
            else:  # if split == 'test':
                dataset = test
        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata
            )

    replace_text_feature_level(
        features,
        [dataset]
    )

    dataset = Dataset(
        dataset,
        model_definition['input_features'],
        output_features,
        data_hdf5_fp,
    )

    return dataset, train_set_metadata
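
A hedged call sketch for the function above; the model directory, csv path and metadata filename are hypothetical.

# Hypothetical call: preprocess a csv for prediction on the full data, without
# loading output features for performance evaluation.
dataset, train_set_metadata = preprocess_for_prediction(
    model_path='results/experiment_run/model',   # hypothetical path
    split='full',
    data_csv='new_data.csv',                     # hypothetical path
    train_set_metadata='results/experiment_run/model/train_set_metadata.json',  # hypothetical
    evaluate_performance=False
)
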
Example #3
def _preprocess_csv_for_training(
        features,
        data_csv=None,
        data_train_csv=None,
        data_validation_csv=None,
        data_test_csv=None,
        train_set_metadata_json=None,
        skip_save_processed_input=False,
        preprocessing_params=default_preprocessing_parameters,
        random_seed=default_random_seed
):
    """
    Method to pre-process csv data
    :param features: list of all features (input + output)
    :param data_csv: path to the csv data
    :param data_train_csv:  training csv data
    :param data_validation_csv: validation csv data
    :param data_test_csv: test csv data
    :param train_set_metadata_json: train set metadata json
    :param skip_save_processed_input: if False, the pre-processed data is saved
    as .hdf5 files in the same location as the csvs with the same names.
    :param preprocessing_params: preprocessing parameters
    :param random_seed: random seed
    :return: training, test, validation datasets, training metadata
    """
    train_set_metadata = None
    if train_set_metadata_json is not None:
        train_set_metadata = load_metadata(train_set_metadata_json)

    if data_csv is not None:
        # Use the full csv and ignore the _train, _validation and _test csvs;
        # the data needs preprocessing
        logger.info(
            'Using full raw csv, no hdf5 and json file '
            'with the same name have been found'
        )
        logger.info('Building dataset (it may take a while)')
        data, train_set_metadata = build_dataset(
            data_csv,
            features,
            preprocessing_params,
            train_set_metadata=train_set_metadata,
            random_seed=random_seed
        )
        if not skip_save_processed_input:
            logger.info('Writing dataset')
            data_hdf5_fp = replace_file_extension(data_csv, 'hdf5')
            data_utils.save_hdf5(data_hdf5_fp, data, train_set_metadata)
            logger.info('Writing train set metadata with vocabulary')

            train_set_metadata_json_fp = replace_file_extension(
                data_csv,
                'json'
            )
            data_utils.save_json(
                train_set_metadata_json_fp, train_set_metadata)

        training_set, test_set, validation_set = split_dataset_tvt(
            data,
            data['split']
        )

    elif data_train_csv is not None:
        # Use the train csv (plus the _validation and _test csvs if present)
        # and ignore the full csv; the data needs preprocessing
        logger.info(
            'Using training raw csv, no hdf5 and json '
            'file with the same name have been found'
        )
        logger.info('Building dataset (it may take a while)')
        concatenated_df = concatenate_csv(
            data_train_csv,
            data_validation_csv,
            data_test_csv
        )
        concatenated_df.csv = data_train_csv
        data, train_set_metadata = build_dataset_df(
            concatenated_df,
            features,
            preprocessing_params,
            train_set_metadata=train_set_metadata,
            random_seed=random_seed
        )
        training_set, test_set, validation_set = split_dataset_tvt(
            data,
            data['split']
        )
        if not skip_save_processed_input:
            logger.info('Writing dataset')
            data_train_hdf5_fp = replace_file_extension(data_train_csv, 'hdf5')
            data_utils.save_hdf5(
                data_train_hdf5_fp,
                training_set,
                train_set_metadata
            )
            if validation_set is not None:
                data_validation_hdf5_fp = replace_file_extension(
                    data_validation_csv,
                    'hdf5'
                )
                data_utils.save_hdf5(
                    data_validation_hdf5_fp,
                    validation_set,
                    train_set_metadata
                )
            if test_set is not None:
                data_test_hdf5_fp = replace_file_extension(data_test_csv,
                                                           'hdf5')
                data_utils.save_hdf5(
                    data_test_hdf5_fp,
                    test_set,
                    train_set_metadata
                )
            logger.info('Writing train set metadata with vocabulary')
            train_set_metadata_json_fp = replace_file_extension(data_train_csv,
                                                                'json')
            data_utils.save_json(train_set_metadata_json_fp, train_set_metadata)

    return training_set, test_set, validation_set, train_set_metadata
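
Every example on this page leans on replace_file_extension; the sketch below shows its presumed behaviour (swapping a file's extension), not the library's actual implementation.

import os

def replace_file_extension_sketch(file_path, extension):
    # Presumed behaviour, for illustration only: swap the extension of
    # file_path for `extension`, e.g. 'data.csv' + 'hdf5' -> 'data.hdf5'.
    extension = extension.strip('.')
    base, _ = os.path.splitext(file_path)
    return base + '.' + extension

# replace_file_extension_sketch('train.csv', 'meta.json') -> 'train.meta.json'
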
Example #4
def preprocess_for_training_by_type(
        model_definition,
        data_type,
        all_data_fp=None,
        train_fp=None,
        validation_fp=None,
        test_fp=None,
        all_data_df=None,
        train_df=None,
        validation_df=None,
        test_df=None,
        train_set_metadata_json=None,
        skip_save_processed_input=False,
        preprocessing_params=default_preprocessing_parameters,
        random_seed=default_random_seed
):
    if all_data_fp is not None and train_fp is not None:
        raise ValueError('Use either one file for all data or 3 files for '
                         'train, test and validation')

    if data_type not in ['hdf5', 'csv', 'pandas']:
        raise ValueError('Invalid type of data provided')

    features = (model_definition['input_features'] +
                model_definition['output_features'])

    data_hdf5_fp = None

    if data_type == 'pandas':
        # Preprocess data frames
        (
            training_set,
            test_set,
            validation_set,
            train_set_metadata
        ) = _preprocess_df_for_training(
            features,
            all_data_df,
            train_df,
            validation_df,
            test_df,
            train_set_metadata_json=train_set_metadata_json,
            preprocessing_params=preprocessing_params,
            random_seed=random_seed
        )
    elif data_type == 'hdf5' and train_set_metadata_json is None:
        raise ValueError('train set metadata file is not found along with hdf5 '
                         'data')
    elif data_type == 'hdf5':
        if all_data_fp is not None:
            data_hdf5_fp = replace_file_extension(all_data_fp, 'hdf5')
            logger.info('Using full hdf5 and json')
            training_set, test_set, validation_set = load_data(
                all_data_fp,
                model_definition['input_features'],
                model_definition['output_features'],
                shuffle_training=True
            )
            train_set_metadata = load_metadata(train_set_metadata_json)
        elif train_fp is not None:
            logger.info('Using hdf5 and json')
            training_set = load_data(
                train_fp,
                model_definition['input_features'],
                model_definition['output_features'],
                split_data=False
            )
            train_set_metadata = load_metadata(train_set_metadata_json)

            validation_set = None
            if validation_fp is not None:
                validation_set = load_data(
                    validation_fp,
                    model_definition['input_features'],
                    model_definition['output_features'],
                    split_data=False
                )

            test_set = None
            if test_fp is not None:
                test_set = load_data(
                    test_fp,
                    model_definition['input_features'],
                    model_definition['output_features'],
                    split_data=False
                )

    elif data_type == 'csv':
        data_hdf5_fp = replace_file_extension(
            all_data_fp, 'hdf5'
        )
        model_definition['data_hdf5_fp'] = data_hdf5_fp

        if all_data_fp is not None:
            if (file_exists_with_diff_extension(all_data_fp, 'hdf5') and
                    file_exists_with_diff_extension(all_data_fp, 'json')):
                # use hdf5 data instead
                logger.info(
                    'Found hdf5 and json with the same filename '
                    'of the csv, using them instead'
                )
                return preprocess_for_training_by_type(
                    model_definition,
                    'hdf5',
                    all_data_fp=replace_file_extension(all_data_fp, 'hdf5'),
                    train_set_metadata_json=replace_file_extension(all_data_fp,
                                                                   'json'),
                    skip_save_processed_input=skip_save_processed_input,
                    preprocessing_params=preprocessing_params,
                    random_seed=random_seed
                )
            else:
                (
                    training_set,
                    test_set,
                    validation_set,
                    train_set_metadata
                ) = _preprocess_csv_for_training(
                    features=features,
                    data_csv=all_data_fp,
                    data_train_csv=None,
                    data_validation_csv=None,
                    data_test_csv=None,
                    train_set_metadata_json=train_set_metadata_json,
                    skip_save_processed_input=skip_save_processed_input,
                    preprocessing_params=preprocessing_params,
                    random_seed=random_seed
                )
        else:
            if (file_exists_with_diff_extension(train_fp, 'hdf5') and
                    file_exists_with_diff_extension(train_fp, 'json') and
                    file_exists_with_diff_extension(validation_fp, 'hdf5') and
                    file_exists_with_diff_extension(test_fp, 'hdf5')):
                logger.info(
                    'Found hdf5 and json with the same filename '
                    'of the csvs, using them instead.'
                )
                return preprocess_for_training_by_type(
                    model_definition,
                    'hdf5',
                    train_fp=replace_file_extension(train_fp, 'hdf5'),
                    validation_fp=replace_file_extension(validation_fp, 'hdf5'),
                    test_fp=replace_file_extension(test_fp, 'hdf5'),
                    train_set_metadata_json=replace_file_extension(train_fp,
                                                                   'json'),
                    skip_save_processed_input=skip_save_processed_input,
                    preprocessing_params=preprocessing_params,
                    random_seed=random_seed
                )
            else:
                (
                    training_set,
                    test_set,
                    validation_set,
                    train_set_metadata
                ) = _preprocess_csv_for_training(
                    features=features,
                    data_csv=None,
                    data_train_csv=train_fp,
                    data_validation_csv=validation_fp,
                    data_test_csv=test_fp,
                    train_set_metadata_json=train_set_metadata_json,
                    skip_save_processed_input=skip_save_processed_input,
                    preprocessing_params=preprocessing_params,
                    random_seed=random_seed
                )
    else:
        raise RuntimeError('Insufficient input parameters')

    replace_text_feature_level(
        model_definition['input_features'] +
        model_definition['output_features'],
        [training_set, validation_set, test_set]
    )

    training_dataset = Dataset(
        training_set,
        model_definition['input_features'],
        model_definition['output_features'],
        data_hdf5_fp
    )

    validation_dataset = None
    if validation_set is not None:
        validation_dataset = Dataset(
            validation_set,
            model_definition['input_features'],
            model_definition['output_features'],
            data_hdf5_fp
        )

    test_dataset = None
    if test_set is not None:
        test_dataset = Dataset(
            test_set,
            model_definition['input_features'],
            model_definition['output_features'],
            data_hdf5_fp
        )

    return (
        training_dataset,
        validation_dataset,
        test_dataset,
        train_set_metadata
    )
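
A hedged call sketch for the function above, assuming a hypothetical model_definition dict and csv path.

# Hypothetical call: preprocess one csv holding all the data; a matching
# hdf5/json cache next to it is reused if present.
(
    training_dataset,
    validation_dataset,
    test_dataset,
    train_set_metadata
) = preprocess_for_training_by_type(
    model_definition,            # hypothetical model definition dict
    'csv',
    all_data_fp='all_data.csv'   # hypothetical path
)
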
Example #5
def create_data_set_to_use(data_format, raw_data):
    # helper function for generating training and test data with specified format
    # handles all data formats except for hdf5
    # assumes raw_data is a csv dataset generated by
    # tests.integration_tests.utils.generate_data() function

    # support for writing to a fwf dataset based on this stackoverflow posting:
    # https://stackoverflow.com/questions/16490261/python-pandas-write-dataframe-to-fixed-width-file-to-fwf
    from tabulate import tabulate

    def to_fwf(df, fname):
        content = tabulate(df.values.tolist(), list(df.columns), tablefmt="plain")
        open(fname, "w").write(content)

    pd.DataFrame.to_fwf = to_fwf

    dataset_to_use = None

    if data_format == "csv":
        dataset_to_use = raw_data

    elif data_format in {"df", "dict"}:
        dataset_to_use = pd.read_csv(raw_data)
        if data_format == "dict":
            dataset_to_use = dataset_to_use.to_dict(orient="list")

    elif data_format == "excel":
        dataset_to_use = replace_file_extension(raw_data, "xlsx")
        pd.read_csv(raw_data).to_excel(dataset_to_use, index=False)

    elif data_format == "excel_xls":
        dataset_to_use = replace_file_extension(raw_data, "xls")
        pd.read_csv(raw_data).to_excel(dataset_to_use, index=False)

    elif data_format == "feather":
        dataset_to_use = replace_file_extension(raw_data, "feather")
        pd.read_csv(raw_data).to_feather(dataset_to_use)

    elif data_format == "fwf":
        dataset_to_use = replace_file_extension(raw_data, "fwf")
        pd.read_csv(raw_data).to_fwf(dataset_to_use)

    elif data_format == "html":
        dataset_to_use = replace_file_extension(raw_data, "html")
        pd.read_csv(raw_data).to_html(dataset_to_use, index=False)

    elif data_format == "json":
        dataset_to_use = replace_file_extension(raw_data, "json")
        pd.read_csv(raw_data).to_json(dataset_to_use, orient="records")

    elif data_format == "jsonl":
        dataset_to_use = replace_file_extension(raw_data, "jsonl")
        pd.read_csv(raw_data).to_json(dataset_to_use, orient="records", lines=True)

    elif data_format == "parquet":
        dataset_to_use = replace_file_extension(raw_data, "parquet")
        pd.read_csv(raw_data).to_parquet(dataset_to_use, index=False)

    elif data_format == "pickle":
        dataset_to_use = replace_file_extension(raw_data, "pickle")
        pd.read_csv(raw_data).to_pickle(dataset_to_use)

    elif data_format == "stata":
        dataset_to_use = replace_file_extension(raw_data, "stata")
        pd.read_csv(raw_data).to_stata(dataset_to_use)

    elif data_format == "tsv":
        dataset_to_use = replace_file_extension(raw_data, "tsv")
        pd.read_csv(raw_data).to_csv(dataset_to_use, sep="\t", index=False)

    else:
        raise ValueError(f"'{data_format}' is an unrecognized data format")

    return dataset_to_use
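
A minimal usage sketch for the test helper above; the csv filename is hypothetical and assumed to come from generate_data().

# Hypothetical usage: derive parquet and dict variants of a generated csv.
parquet_path = create_data_set_to_use('parquet', 'generated.csv')
data_dict = create_data_set_to_use('dict', 'generated.csv')
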
Example #6
def preprocess_for_training(
        model_definition,
        dataset_type='generic',
        data_df=None,
        data_train_df=None,
        data_validation_df=None,
        data_test_df=None,
        data_csv=None,
        data_train_csv=None,
        data_validation_csv=None,
        data_test_csv=None,
        data_hdf5=None,
        data_train_hdf5=None,
        data_validation_hdf5=None,
        data_test_hdf5=None,
        train_set_metadata_json=None,
        skip_save_processed_input=False,
        preprocessing_params=default_preprocessing_parameters,
        random_seed=default_random_seed):
    # Sanity Check to make sure some data source is provided
    data_sources_provided = [
        data_df, data_train_df, data_csv, data_train_csv, data_hdf5,
        data_train_hdf5
    ]
    data_sources_not_none = [x is not None for x in data_sources_provided]
    if not any(data_sources_not_none):
        raise ValueError('No training data is provided!')

    # Check if hdf5 and json already exist. If they do, use the hdf5 data,
    # instead of the csvs
    data_hdf5_fp = None

    if data_csv is not None:
        data_hdf5_fp = replace_file_extension(data_csv, 'hdf5')
        train_set_metadata_json_fp = replace_file_extension(data_csv, 'json')
        if os.path.isfile(data_hdf5_fp) and os.path.isfile(
                train_set_metadata_json_fp):
            logging.info('Found hdf5 and json with the same filename '
                         'of the csv, using them instead')
            data_csv = None
            data_hdf5 = data_hdf5_fp
            train_set_metadata_json = train_set_metadata_json_fp

    if data_train_csv is not None:
        data_train_hdf5_fp = replace_file_extension(data_train_csv, 'hdf5')
        train_set_metadata_json_fp = replace_file_extension(
            data_train_csv,
            'json',
        )

        if os.path.isfile(data_train_hdf5_fp) and os.path.isfile(
                train_set_metadata_json_fp):
            logging.info('Found hdf5 and json with the same filename of '
                         'the train csv, using them instead')
            data_train_csv = None
            data_train_hdf5 = data_train_hdf5_fp
            train_set_metadata_json = train_set_metadata_json_fp

    if data_validation_csv is not None:
        data_validation_hdf5_fp = replace_file_extension(
            data_validation_csv, 'hdf5')
        if os.path.isfile(data_validation_hdf5_fp):
            logging.info('Found hdf5 with the same filename of '
                         'the validation csv, using it instead')
            data_validation_csv = None
            data_validation_hdf5 = data_validation_hdf5_fp

    if data_test_csv is not None:
        data_test_hdf5_fp = replace_file_extension(data_test_csv, 'hdf5')
        if os.path.isfile(data_test_hdf5_fp):
            logging.info('Found hdf5 with the same filename of '
                         'the test csv, using it instead')
            data_test_csv = None
            data_test_hdf5 = data_test_hdf5_fp

    model_definition['data_hdf5_fp'] = data_hdf5_fp

    # Decide if to preprocess or just load
    features = (model_definition['input_features'] +
                model_definition['output_features'])
    (concatenate_csv, concatenate_df, build_dataset,
     build_dataset_df) = get_dataset_fun(dataset_type)

    if data_df is not None or data_train_df is not None:
        # Preprocess data frames
        (training_set, test_set, validation_set,
         train_set_metadata) = _preprocess_df_for_training(
             features, data_df, data_train_df, data_validation_df,
             data_test_df, preprocessing_params, random_seed)
    elif data_csv is not None or data_train_csv is not None:
        # Preprocess csv data
        (training_set, test_set, validation_set,
         train_set_metadata) = _preprocess_csv_for_training(
             features, data_csv, data_train_csv, data_validation_csv,
             data_test_csv, skip_save_processed_input, preprocessing_params,
             random_seed)

    elif data_hdf5 is not None and train_set_metadata_json is not None:
        # use data and train set metadata
        # does not need preprocessing, just load
        logging.info('Using full hdf5 and json')
        training_set, test_set, validation_set = load_data(
            data_hdf5,
            model_definition['input_features'],
            model_definition['output_features'],
            shuffle_training=True)
        train_set_metadata = load_metadata(train_set_metadata_json)

    elif data_train_hdf5 is not None and train_set_metadata_json is not None:
        # use data and train set metadata
        # does not need preprocessing, just load
        logging.info('Using hdf5 and json')
        training_set = load_data(data_train_hdf5,
                                 model_definition['input_features'],
                                 model_definition['output_features'],
                                 split_data=False)
        train_set_metadata = load_metadata(train_set_metadata_json)
        if data_validation_hdf5 is not None:
            validation_set = load_data(data_validation_hdf5,
                                       model_definition['input_features'],
                                       model_definition['output_features'],
                                       split_data=False)
        else:
            validation_set = None
        if data_test_hdf5 is not None:
            test_set = load_data(data_test_hdf5,
                                 model_definition['input_features'],
                                 model_definition['output_features'],
                                 split_data=False)
        else:
            test_set = None

    else:
        raise RuntimeError('Insufficient input parameters')

    replace_text_feature_level(
        model_definition['input_features'] +
        model_definition['output_features'],
        [training_set, validation_set, test_set])

    training_dataset = Dataset(training_set,
                               model_definition['input_features'],
                               model_definition['output_features'],
                               data_hdf5_fp)

    validation_dataset = None
    if validation_set is not None:
        validation_dataset = Dataset(validation_set,
                                     model_definition['input_features'],
                                     model_definition['output_features'],
                                     data_hdf5_fp)

    test_dataset = None
    if test_set is not None:
        test_dataset = Dataset(test_set, model_definition['input_features'],
                               model_definition['output_features'],
                               data_hdf5_fp)

    return (training_dataset, validation_dataset, test_dataset,
            train_set_metadata)
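
A hedged call sketch for the function above using an in-memory data frame; model_definition and the toy frame are hypothetical, and the frame's columns are assumed to match the definition's features.

import pandas as pd

# Hypothetical call: feed a data frame directly instead of csv/hdf5 files.
data_df = pd.DataFrame({'doc': ['a b c', 'd e f'], 'label': ['x', 'y']})  # toy data
(
    training_dataset,
    validation_dataset,
    test_dataset,
    train_set_metadata
) = preprocess_for_training(
    model_definition,   # hypothetical model definition dict
    data_df=data_df
)
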
Example #7
def preprocess_for_prediction(
        model_path,
        split,
        data_csv=None,
        data_hdf5=None,
        train_set_metadata=None,
        evaluate_performance=True
):
    """Preprocesses the dataset to parse it into a format that is usable by the
    Ludwig core
        :param model_path: Path to the trained model directory; the model
               hyperparameters file inside it is loaded to obtain the model
               definition
        :type model_path: str
        :param split: Which split of the data to use
               ('full', 'training', 'validation' or 'test')
        :param data_csv: The CSV input data file
        :param data_hdf5: The hdf5 data file if there is no csv data file
        :param train_set_metadata: Train set metadata for the input features
        :param evaluate_performance: If False, does not load output features
        :returns: Dataset, train set metadata
        """
    model_definition = load_json(
        os.path.join(model_path, MODEL_HYPERPARAMETERS_FILE_NAME)
    )
    for input_feature in model_definition['input_features']:
        if 'preprocessing' in input_feature:
            if 'in_memory' in input_feature['preprocessing']:
                if not input_feature['preprocessing']['in_memory']:
                    logger.warning(
                        'WARNING: When running predict in_memory flag should '
                        'be true. Overriding and setting it to true for '
                        'feature <{}>'.format(input_feature['name'])
                    )
                    input_feature['preprocessing']['in_memory'] = True
    preprocessing_params = merge_dict(
        default_preprocessing_parameters,
        model_definition['preprocessing']
    )
    output_features = model_definition[
        'output_features'] if evaluate_performance else []
    features = model_definition['input_features'] + output_features

    # Check if hdf5 file already exists
    if data_csv is not None:
        data_hdf5_fp = replace_file_extension(data_csv, 'hdf5')
        if os.path.isfile(data_hdf5_fp):
            logger.info('Found hdf5 with the same filename of the csv, '
                        'using it instead')
            data_csv = None
            data_hdf5 = data_hdf5_fp
    else:
        data_hdf5_fp = None

    # Load data
    train_set_metadata = load_metadata(train_set_metadata)
    if split == FULL:
        if data_hdf5 is not None:
            dataset = load_data(
                data_hdf5,
                model_definition['input_features'],
                output_features,
                split_data=False, shuffle_training=False
            )
        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata
            )
    else:
        if data_hdf5 is not None:
            training_set, test_set, validation_set = load_data(
                data_hdf5,
                model_definition['input_features'],
                output_features,
                shuffle_training=False
            )

            if split == TRAINING:
                dataset = training_set
            elif split == VALIDATION:
                dataset = validation_set
            else:  # if split == TEST:
                dataset = test_set

        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata
            )
            # build_dataset adds a split column if there is none in the csv
            # so if we want to check if the csv contained a split column
            # we have to check in the csv not in the built dataset.
            # The logic is that if there is no split in the original csv
            # we treat the split parameter as if it was == full
            if csv_contains_column(data_csv, SPLIT):
                training_set, test_set, validation_set = split_dataset_tvt(
                    dataset,
                    dataset[SPLIT]
                )
                if split == TRAINING:
                    dataset = training_set
                elif split == VALIDATION:
                    dataset = validation_set
                else:  # if split == TEST:
                    dataset = test_set
            else:
                logger.warning(
                    'You requested the {} split, but the data CSV '
                    'does not contain a "split" column, so the '
                    'full data will be used instead'
                )

    replace_text_feature_level(
        features,
        [dataset]
    )

    dataset = Dataset(
        dataset,
        model_definition['input_features'],
        output_features,
        train_set_metadata.get(DATA_TRAIN_HDF5_FP)
    )

    return dataset, train_set_metadata
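
A hedged call sketch for this variant; TEST is assumed to be one of the split constants referenced above (FULL, TRAINING, VALIDATION, TEST), and all paths are hypothetical.

# Hypothetical call: preprocess only the test split of a labeled csv for evaluation.
dataset, train_set_metadata = preprocess_for_prediction(
    model_path='results/experiment_run/model',   # hypothetical path
    split=TEST,
    data_csv='labeled_data.csv',                 # hypothetical path
    train_set_metadata='results/experiment_run/model/train_set_metadata.json',  # hypothetical
    evaluate_performance=True
)
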
Example #8
def create_data_set_to_use(data_format, raw_data):
    # helper function for generating training and test data with specified format
    # handles all data formats except for hdf5
    # assumes raw_data is a csv dataset generated by
    # tests.integration_tests.utils.generate_data() function

    # support for writing to a fwf dataset based on this stackoverflow posting:
    # https://stackoverflow.com/questions/16490261/python-pandas-write-dataframe-to-fixed-width-file-to-fwf
    from tabulate import tabulate
    def to_fwf(df, fname):
        content = tabulate(df.values.tolist(), list(df.columns),
                           tablefmt="plain")
        open(fname, "w").write(content)

    pd.DataFrame.to_fwf = to_fwf

    dataset_to_use = None

    if data_format == 'csv':
        dataset_to_use = raw_data

    elif data_format in {'df', 'dict'}:
        dataset_to_use = pd.read_csv(raw_data)
        if data_format == 'dict':
            dataset_to_use = dataset_to_use.to_dict(orient='list')

    elif data_format == 'excel':
        dataset_to_use = replace_file_extension(raw_data, 'xlsx')
        pd.read_csv(raw_data).to_excel(
            dataset_to_use,
            index=False
        )

    elif data_format == 'excel_xls':
        dataset_to_use = replace_file_extension(raw_data, 'xls')
        pd.read_csv(raw_data).to_excel(
            dataset_to_use,
            index=False
        )

    elif data_format == 'feather':
        dataset_to_use = replace_file_extension(raw_data, 'feather')
        pd.read_csv(raw_data).to_feather(
            dataset_to_use
        )

    elif data_format == 'fwf':
        dataset_to_use = replace_file_extension(raw_data, 'fwf')
        pd.read_csv(raw_data).to_fwf(
            dataset_to_use
        )

    elif data_format == 'html':
        dataset_to_use = replace_file_extension(raw_data, 'html')
        pd.read_csv(raw_data).to_html(
            dataset_to_use,
            index=False
        )

    elif data_format == 'json':
        dataset_to_use = replace_file_extension(raw_data, 'json')
        pd.read_csv(raw_data).to_json(
            dataset_to_use,
            orient='records'
        )

    elif data_format == 'jsonl':
        dataset_to_use = replace_file_extension(raw_data, 'jsonl')
        pd.read_csv(raw_data).to_json(
            dataset_to_use,
            orient='records',
            lines=True
        )

    elif data_format == 'parquet':
        dataset_to_use = replace_file_extension(raw_data, 'parquet')
        pd.read_csv(raw_data).to_parquet(
            dataset_to_use,
            index=False
        )

    elif data_format == 'pickle':
        dataset_to_use = replace_file_extension(raw_data, 'pickle')
        pd.read_csv(raw_data).to_pickle(
            dataset_to_use
        )

    elif data_format == 'stata':
        dataset_to_use = replace_file_extension(raw_data, 'stata')
        pd.read_csv(raw_data).to_stata(
            dataset_to_use
        )

    elif data_format == 'tsv':
        dataset_to_use = replace_file_extension(raw_data, 'tsv')
        pd.read_csv(raw_data).to_csv(
            dataset_to_use,
            sep='\t',
            index=False
        )

    else:
        raise ValueError(
            "'{}' is an unrecognized data format".format(data_format)
        )

    return dataset_to_use
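
A standalone sketch of the fixed-width writer that the helper above monkey-patches onto pd.DataFrame; the toy data and output path are hypothetical.

import pandas as pd
from tabulate import tabulate

# Render the frame with tabulate's plain format and write it as fixed-width text.
df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})   # toy data
with open('data.fwf', 'w') as f:                # hypothetical output path
    f.write(tabulate(df.values.tolist(), list(df.columns), tablefmt='plain'))
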
Example #9
def test_cache_checksum(csv_filename, tmp_path):
    # setup for training
    input_features = [category_feature(vocab_size=5)]
    output_features = [category_feature(vocab_size=2, top_k=2)]

    source_dataset = os.path.join(tmp_path, csv_filename)
    source_dataset = generate_data(input_features, output_features, source_dataset)

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "preprocessing": {"text": {"most_common_word": 1000}},
        TRAINER: {"epochs": 2},
    }

    backend = LocalTestBackend()
    cache_fname = replace_file_extension(source_dataset, TRAINING_PREPROC_FILE_NAME)

    # conduct initial training
    output_directory = os.path.join(tmp_path, "results")
    model = LudwigModel(config, backend=backend)
    model.train(dataset=source_dataset, output_directory=output_directory)
    first_training_timestamp = os.path.getmtime(cache_fname)

    # conduct second training, should not force recreating hdf5
    model = LudwigModel(config, backend=backend)
    model.train(dataset=source_dataset, output_directory=output_directory)
    current_training_timestamp = os.path.getmtime(cache_fname)

    # time stamps should be the same
    assert first_training_timestamp == current_training_timestamp

    # force recreating cache file by changing checksum
    prior_training_timestamp = current_training_timestamp
    config["preprocessing"]["text"]["most_common_word"] = 2000
    model = LudwigModel(config, backend=backend)
    model.train(dataset=source_dataset, output_directory=output_directory)
    current_training_timestamp = os.path.getmtime(cache_fname)

    # timestamp should differ
    assert prior_training_timestamp < current_training_timestamp

    # force recreating cache by updating modification time of source dataset
    prior_training_timestamp = current_training_timestamp
    os.utime(source_dataset)
    model = LudwigModel(config, backend=backend)
    model.train(dataset=source_dataset, output_directory=output_directory)
    current_training_timestamp = os.path.getmtime(cache_fname)

    # timestamps should be different
    assert prior_training_timestamp < current_training_timestamp

    # force change in feature preprocessing
    prior_training_timestamp = current_training_timestamp
    input_features = config["input_features"].copy()
    input_features[0]["preprocessing"] = {"lowercase": True}
    config["input_features"] = input_features
    model = LudwigModel(config, backend=backend)
    model.train(dataset=source_dataset, output_directory=output_directory)
    current_training_timestamp = os.path.getmtime(cache_fname)

    # timestamps should be different
    assert prior_training_timestamp < current_training_timestamp

    # force change in features names (and properties)
    prior_training_timestamp = current_training_timestamp
    input_features = [category_feature(vocab_size=5), category_feature()]
    source_dataset = generate_data(input_features, output_features, source_dataset)
    config["input_features"] = input_features
    model = LudwigModel(config, backend=backend)
    model.train(dataset=source_dataset, output_directory=output_directory)
    current_training_timestamp = os.path.getmtime(cache_fname)

    # timestamps should be different
    assert prior_training_timestamp < current_training_timestamp

    # force change in Ludwig version
    prior_training_timestamp = current_training_timestamp
    global_vars.LUDWIG_VERSION = "new_version"
    model = LudwigModel(config, backend=backend)
    model.train(dataset=source_dataset, output_directory=output_directory)
    current_training_timestamp = os.path.getmtime(cache_fname)

    # timestamps should be different
    assert prior_training_timestamp < current_training_timestamp
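
The test above exercises cache invalidation. Purely as an illustration (this is not Ludwig's actual checksum code), a cache key that reacts to the same inputs could be built like this:

import hashlib
import json
import os

def illustrative_cache_checksum(config, dataset_path, ludwig_version):
    # Illustrative only: combine the preprocessing-relevant config, the dataset
    # modification time and the library version so that changing any of them
    # produces a different key and therefore invalidates the cache.
    payload = json.dumps(
        {
            'config': config,
            'dataset_mtime': os.path.getmtime(dataset_path),
            'version': ludwig_version,
        },
        sort_keys=True,
    )
    return hashlib.md5(payload.encode('utf-8')).hexdigest()
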
Example #10
def test_cache_checksum(csv_filename, tmp_path):
    # setup for training
    input_features = [category_feature(vocab_size=5)]
    output_features = [category_feature(vocab_size=2)]

    source_dataset = os.path.join(tmp_path, csv_filename)
    source_dataset = generate_data(input_features, output_features,
                                   source_dataset)

    config = {
        'input_features': input_features,
        'output_features': output_features,
        'preprocessing': {'text': {'most_common_word': 1000}},
        'training': {'epochs': 2}
    }

    # conduct initial training
    output_directory = os.path.join(tmp_path, 'results')
    model = LudwigModel(config)
    _, _, train_output_directory1 = \
        model.train(dataset=source_dataset, output_directory=output_directory)
    first_training_timestamp = \
        os.path.getmtime(replace_file_extension(source_dataset, 'hdf5'))

    # conduct second training, should not force recreating hdf5
    model = LudwigModel(config)
    _, _, train_output_directory2 = \
        model.train(dataset=source_dataset, output_directory=output_directory)
    second_training_timestamp = \
        os.path.getmtime(replace_file_extension(source_dataset, 'hdf5'))

    # time stamps should be the same
    assert first_training_timestamp == second_training_timestamp

    # force recreating cache file by changing checksum
    config['preprocessing']['text']['most_common_word'] = 2000
    model = LudwigModel(config)
    _, _, train_output_directory3 = \
        model.train(dataset=source_dataset, output_directory=output_directory)
    third_training_timestamp = \
        os.path.getmtime(replace_file_extension(source_dataset, 'hdf5'))

    # timestamp should differ
    assert first_training_timestamp < third_training_timestamp

    # force recreating cache by updating modification time of source dataset
    os.utime(source_dataset)
    model = LudwigModel(config)
    _, _, train_output_directory4 = \
        model.train(dataset=source_dataset, output_directory=output_directory)
    fourth_training_timestamp = \
        os.path.getmtime(replace_file_extension(source_dataset, 'hdf5'))

    # timestamps should be different
    assert third_training_timestamp < fourth_training_timestamp

    # force change in feature preprocessing
    input_features = config['input_features'].copy()
    input_features[0]['preprocessing'] = {'lowercase': True}
    config['input_features'] = input_features
    model = LudwigModel(config)
    _, _, train_output_directory5 = \
        model.train(dataset=source_dataset, output_directory=output_directory)
    fifth_training_timestamp = \
        os.path.getmtime(replace_file_extension(source_dataset, 'hdf5'))

    # timestamps should be different
    assert fourth_training_timestamp < fifth_training_timestamp

    # force change in features names (and properties)
    input_features = [category_feature(vocab_size=5), category_feature()]
    source_dataset = generate_data(input_features, output_features,
                                   source_dataset)
    config['input_features'] = input_features
    model = LudwigModel(config)
    _, _, train_output_directory5 = \
        model.train(dataset=source_dataset, output_directory=output_directory)
    sixth_training_timestamp = \
        os.path.getmtime(replace_file_extension(source_dataset, 'hdf5'))

    # timestamps should be different
    assert fifth_training_timestamp < sixth_training_timestamp

    # force change in Ludwig version
    global_vars.LUDWIG_VERSION = 'new_version'
    model = LudwigModel(config)
    _, _, train_output_directory5 = \
        model.train(dataset=source_dataset, output_directory=output_directory)
    seventh_training_timestamp = \
        os.path.getmtime(replace_file_extension(source_dataset, 'hdf5'))

    # timestamps should be different
    assert sixth_training_timestamp < seventh_training_timestamp
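
A tiny illustration of the mtime mechanics this test relies on: os.utime() with no times argument touches the file, which is what forces the cache rebuild above. The file path is hypothetical.

import os
import time

path = 'dataset.csv'                 # hypothetical file
open(path, 'w').close()
before = os.path.getmtime(path)
time.sleep(0.01)
os.utime(path)                       # bump the modification time to "now"
assert os.path.getmtime(path) >= before
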
Example #11
def test_cache_checksum(csv_filename, tmp_path):
    # setup for training
    input_features = [category_feature(vocab_size=5)]
    output_features = [category_feature(vocab_size=2, top_k=2)]

    source_dataset = os.path.join(tmp_path, csv_filename)
    source_dataset = generate_data(input_features, output_features,
                                   source_dataset)

    config = {
        INPUT_FEATURES: input_features,
        OUTPUT_FEATURES: output_features,
        DEFAULTS: {
            CATEGORY: {
                PREPROCESSING: {
                    "fill_value": "<UNKNOWN>"
                }
            }
        },
        TRAINER: {
            EPOCHS: 2
        },
    }

    backend = LocalTestBackend()
    cache_fname = replace_file_extension(source_dataset,
                                         TRAINING_PREPROC_FILE_NAME)

    # conduct initial training
    output_directory = os.path.join(tmp_path, "results")
    model = LudwigModel(config, backend=backend)
    model.train(dataset=source_dataset, output_directory=output_directory)
    first_training_timestamp = os.path.getmtime(cache_fname)

    # conduct second training, should not force recreating hdf5
    model = LudwigModel(config, backend=backend)
    model.train(dataset=source_dataset, output_directory=output_directory)
    current_training_timestamp = os.path.getmtime(cache_fname)

    # time stamps should be the same
    assert first_training_timestamp == current_training_timestamp

    # force recreating cache file by changing checksum by updating defaults
    prior_training_timestamp = current_training_timestamp
    config[DEFAULTS][CATEGORY][PREPROCESSING]["fill_value"] = "<EMPTY>"
    model = LudwigModel(config, backend=backend)
    model.train(dataset=source_dataset, output_directory=output_directory)
    current_training_timestamp = os.path.getmtime(cache_fname)

    # timestamp should differ
    assert prior_training_timestamp < current_training_timestamp

    # force recreating cache by updating modification time of source dataset
    prior_training_timestamp = current_training_timestamp
    os.utime(source_dataset)
    model = LudwigModel(config, backend=backend)
    model.train(dataset=source_dataset, output_directory=output_directory)
    current_training_timestamp = os.path.getmtime(cache_fname)

    # timestamps should be different
    assert prior_training_timestamp < current_training_timestamp

    # force change in feature preprocessing
    prior_training_timestamp = current_training_timestamp
    input_features = config[INPUT_FEATURES].copy()
    input_features[0][PREPROCESSING] = {"lowercase": True}
    config[INPUT_FEATURES] = input_features
    model = LudwigModel(config, backend=backend)
    model.train(dataset=source_dataset, output_directory=output_directory)
    current_training_timestamp = os.path.getmtime(cache_fname)

    # timestamps should be different
    assert prior_training_timestamp < current_training_timestamp

    # force change in features names (and properties)
    prior_training_timestamp = current_training_timestamp
    input_features = [category_feature(vocab_size=5), category_feature()]
    source_dataset = generate_data(input_features, output_features,
                                   source_dataset)
    config[INPUT_FEATURES] = input_features
    model = LudwigModel(config, backend=backend)
    model.train(dataset=source_dataset, output_directory=output_directory)
    current_training_timestamp = os.path.getmtime(cache_fname)

    # timestamps should be different
    assert prior_training_timestamp < current_training_timestamp

    # force change in Ludwig version
    prior_training_timestamp = current_training_timestamp
    global_vars.LUDWIG_VERSION = "new_version"
    model = LudwigModel(config, backend=backend)
    model.train(dataset=source_dataset, output_directory=output_directory)
    current_training_timestamp = os.path.getmtime(cache_fname)

    # timestamps should be different
    assert prior_training_timestamp < current_training_timestamp
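
The uppercase names in the config above are assumed to be the plain string keys exported by ludwig.constants; with literal strings the same config would read roughly as follows.

# Assumed plain-string equivalent of the constant-based config above.
config = {
    'input_features': input_features,
    'output_features': output_features,
    'defaults': {
        'category': {
            'preprocessing': {'fill_value': '<UNKNOWN>'}
        }
    },
    'trainer': {'epochs': 2},
}
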
Example #12
import logging

import pandas as pd
import dask.dataframe as dd
from ludwig.api import LudwigModel
from ludwig.utils.data_utils import replace_file_extension

csv_path = "./data/train_combined.csv"

dataset_to_use = replace_file_extension(csv_path, 'parquet')
pd.read_csv(csv_path).to_parquet(
    dataset_to_use,
    index=False
)

# dataset_to_use = dd.read_csv(csv_path)


model = LudwigModel(
    # config='./config/large.yaml',
    config='./config/small.yaml',
    logging_level=logging.INFO,
    backend='ray')

# model = LudwigModel(
#     # config='./config/large.yaml',
#     config='./config/small.yaml',
#     logging_level=logging.INFO)

(
    train_stats,