Example #1
def _preprocess_df_for_training(
        features,
        data_df=None,
        data_train_df=None,
        data_validation_df=None,
        data_test_df=None,
        preprocessing_params=default_preprocessing_parameters,
        random_seed=default_random_seed):
    """ Method to pre-process dataframes. This doesn't have the optoin to save the
    processed data as hdf5 as we don't expect users to do this as the data can
    be processed in memory
    """

    if data_df is not None:
        # needs preprocessing
        logging.info('Using full dataframe')
        logging.info('Building dataset (it may take a while)')

    elif data_train_df is not None:
        # needs preprocessing
        logging.info('Using training dataframe')
        logging.info('Building dataset (it may take a while)')
        data_df = concatenate_df(data_train_df, data_validation_df,
                                 data_test_df)

    data, train_set_metadata = build_dataset_df(data_df,
                                                features,
                                                preprocessing_params,
                                                random_seed=random_seed)
    training_set, test_set, validation_set = split_dataset_tvt(
        data, data['split'])
    return training_set, test_set, validation_set, train_set_metadata
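For reference, split_dataset_tvt partitions a column-oriented dataset by the values of a split array (0 = training, 1 = validation, 2 = test) and returns the sets in training/test/validation order. A minimal self-contained sketch of that convention using plain NumPy (this re-implements the behavior for illustration only; it is not the Ludwig helper itself):

import numpy as np

def split_tvt_sketch(dataset, split):
    # illustrative only: mimic the 0/1/2 split convention used above
    def take(value):
        mask = split == value
        return {name: column[mask] for name, column in dataset.items()}
    # the helper in the listing returns training, test, validation in this order
    return take(0), take(2), take(1)

data = {'x': np.arange(6), 'y': np.array([0, 1, 0, 1, 0, 1])}
split = np.array([0, 0, 1, 1, 2, 2])
training, test, validation = split_tvt_sketch(data, split)
print(training['x'], validation['x'], test['x'])  # [0 1] [2 3] [4 5]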
Example #2
def load_data(
        hdf5_file_path,
        input_features,
        output_features,
        split_data=True,
        shuffle_training=False
):
    logger.info('Loading data from: {0}'.format(hdf5_file_path))
    # Load data from file
    hdf5_data = h5py.File(hdf5_file_path, 'r')
    dataset = {}
    for input_feature in input_features:
        if input_feature['type'] == TEXT:
            text_data_field = text_feature_data_field(input_feature)
            dataset[text_data_field] = hdf5_data[text_data_field].value
        else:
            dataset[input_feature['name']] = hdf5_data[
                input_feature['name']
            ].value
    for output_feature in output_features:
        if output_feature['type'] == TEXT:
            dataset[text_feature_data_field(output_feature)] = hdf5_data[
                text_feature_data_field(output_feature)
            ].value
        else:
            dataset[output_feature['name']] = hdf5_data[
                output_feature['name']].value
        if 'limit' in output_feature:
            dataset[output_feature['name']] = collapse_rare_labels(
                dataset[output_feature['name']],
                output_feature['limit']
            )

    if not split_data:
        hdf5_data.close()
        return dataset

    split = hdf5_data['split'].value
    hdf5_data.close()
    training_set, test_set, validation_set = split_dataset_tvt(dataset, split)

    # optionally shuffle the training set in unison across all features
    if shuffle_training:
        training_set = data_utils.shuffle_dict_unison_inplace(training_set)

    return training_set, test_set, validation_set
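Note that hdf5_data[name].value is the legacy h5py accessor; it was removed in h5py 3.x, where dataset[()] (or slicing) reads the full array instead. A small self-contained sketch of the equivalent read on a throwaway file (file name and contents are illustrative only):

import h5py
import numpy as np

# write a throwaway HDF5 file so there is something to read back
with h5py.File('example.hdf5', 'w') as f:
    f.create_dataset('feature', data=np.arange(5))
    f.create_dataset('split', data=np.array([0, 0, 1, 2, 2]))

with h5py.File('example.hdf5', 'r') as f:
    feature = f['feature'][()]  # modern replacement for f['feature'].value
    split = f['split'][()]
print(feature, split)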
Example #3
def obtain_df_splits(data_csv):
    """Split input data csv file in to train, validation and test dataframes.

    :param data_csv: Input data CSV file.
    :return test_df, train_df, val_df: Train, validation and test dataframe
            splits
    """
    data_df = read_csv(data_csv)
    # Obtain data split array mapping data rows to split type
    # 0-train, 1-validation, 2-test
    data_split = get_split(data_df)
    train_split, test_split, val_split = split_dataset_tvt(data_df, data_split)
    # Splits are Python dictionaries, not dataframes; they need to be converted.
    test_df = pd.DataFrame(test_split)
    train_df = pd.DataFrame(train_split)
    val_df = pd.DataFrame(val_split)
    return test_df, train_df, val_df
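The same three-way split can be reproduced with plain pandas when the CSV already carries a numeric split column (0 = train, 1 = validation, 2 = test). A minimal sketch, assuming such a column exists (the toy data below is made up):

import pandas as pd

df = pd.DataFrame({
    'text': ['a', 'b', 'c', 'd', 'e', 'f'],
    'label': [0, 1, 0, 1, 0, 1],
    'split': [0, 0, 1, 1, 2, 2],
})

train_df = df[df['split'] == 0].drop(columns='split')
val_df = df[df['split'] == 1].drop(columns='split')
test_df = df[df['split'] == 2].drop(columns='split')
print(len(train_df), len(val_df), len(test_df))  # 2 2 2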
Example #4
def _preprocess_csv_for_training(
        features,
        data_csv=None,
        data_train_csv=None,
        data_validation_csv=None,
        data_test_csv=None,
        train_set_metadata_json=None,
        skip_save_processed_input=False,
        preprocessing_params=default_preprocessing_parameters,
        random_seed=default_random_seed
):
    """
    Method to pre-process csv data
    :param features: list of all features (input + output)
    :param data_csv: path to the csv data
    :param data_train_csv:  training csv data
    :param data_validation_csv: validation csv data
    :param data_test_csv: test csv data
    :param train_set_metadata_json: train set metadata json
    :param skip_save_processed_input: if False, the pre-processed data is saved
    as .hdf5 files in the same location as the csvs with the same names.
    :param preprocessing_params: preprocessing parameters
    :param random_seed: random seed
    :return: training, test, validation datasets, training metadata
    """
    train_set_metadata = None
    if train_set_metadata_json is not None:
        train_set_metadata = load_metadata(train_set_metadata_json)

    if data_csv is not None:
        # Use the full csv and ignore the _train, _validation and _test csvs;
        # the data needs preprocessing
        logger.info(
            'Using full raw csv: no hdf5 and json files '
            'with the same name have been found'
        )
        logger.info('Building dataset (it may take a while)')
        data, train_set_metadata = build_dataset(
            data_csv,
            features,
            preprocessing_params,
            train_set_metadata=train_set_metadata,
            random_seed=random_seed
        )
        if not skip_save_processed_input:
            logger.info('Writing dataset')
            data_hdf5_fp = replace_file_extension(data_csv, 'hdf5')
            data_utils.save_hdf5(data_hdf5_fp, data, train_set_metadata)
            logger.info('Writing train set metadata with vocabulary')

            train_set_metadata_json_fp = replace_file_extension(
                data_csv,
                'json'
            )
            data_utils.save_json(
                train_set_metadata_json_fp, train_set_metadata)

        training_set, test_set, validation_set = split_dataset_tvt(
            data,
            data['split']
        )

    elif data_train_csv is not None:
        # use the training csv (plus _validation and _test if present)
        # and ignore the full csv; the data needs preprocessing
        logger.info(
            'Using training raw csv: no hdf5 and json '
            'files with the same name have been found'
        )
        logger.info('Building dataset (it may take a while)')
        concatenated_df = concatenate_csv(
            data_train_csv,
            data_validation_csv,
            data_test_csv
        )
        concatenated_df.csv = data_train_csv
        data, train_set_metadata = build_dataset_df(
            concatenated_df,
            features,
            preprocessing_params,
            train_set_metadata=train_set_metadata,
            random_seed=random_seed
        )
        training_set, test_set, validation_set = split_dataset_tvt(
            data,
            data['split']
        )
        if not skip_save_processed_input:
            logger.info('Writing dataset')
            data_train_hdf5_fp = replace_file_extension(data_train_csv, 'hdf5')
            data_utils.save_hdf5(
                data_train_hdf5_fp,
                training_set,
                train_set_metadata
            )
            if validation_set is not None:
                data_validation_hdf5_fp = replace_file_extension(
                    data_validation_csv,
                    'hdf5'
                )
                data_utils.save_hdf5(
                    data_validation_hdf5_fp,
                    validation_set,
                    train_set_metadata
                )
            if test_set is not None:
                data_test_hdf5_fp = replace_file_extension(data_test_csv,
                                                           'hdf5')
                data_utils.save_hdf5(
                    data_test_hdf5_fp,
                    test_set,
                    train_set_metadata
                )
            logger.info('Writing train set metadata with vocabulary')
            train_set_metadata_json_fp = replace_file_extension(data_train_csv,
                                                                'json')
            data_utils.save_json(train_set_metadata_json_fp, train_set_metadata)

    return training_set, test_set, validation_set, train_set_metadata
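The cached artifacts in this example are written next to the source csv, keeping the basename and swapping only the extension. A hedged sketch of that naming convention (replace_file_extension is taken from the listing; this stand-in re-implementation is illustrative only):

import os

def replace_file_extension_sketch(path, extension):
    # swap the extension, keep the directory and basename
    return os.path.splitext(path)[0] + '.' + extension

print(replace_file_extension_sketch('/data/train.csv', 'hdf5'))  # /data/train.hdf5
print(replace_file_extension_sketch('/data/train.csv', 'json'))  # /data/train.json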
Example #5
def preprocess_for_prediction(
        model_path,
        split,
        data_csv=None,
        data_hdf5=None,
        train_set_metadata=None,
        evaluate_performance=True
):
    """Preprocesses the dataset to parse it into a format that is usable by the
    Ludwig core
        :param model_path: The input data that is joined with the model
               hyperparameter file to create the model definition file
        :type model_path: Str
        :param split: Splits the data into the train and test sets
        :param data_csv: The CSV input data file
        :param data_hdf5: The hdf5 data file if there is no csv data file
        :param train_set_metadata: Train set metadata for the input features
        :param evaluate_performance: If False does not load output features
        :returns: Dataset, Train set metadata
        """
    model_definition = load_json(
        os.path.join(model_path, MODEL_HYPERPARAMETERS_FILE_NAME)
    )
    for input_feature in model_definition['input_features']:
        if 'preprocessing' in input_feature:
            if 'in_memory' in input_feature['preprocessing']:
                if not input_feature['preprocessing']['in_memory']:
                    logger.warning(
                        'WARNING: When running predict in_memory flag should '
                        'be true. Overriding and setting it to true for '
                        'feature <{}>'.format(input_feature['name'])
                    )
                    input_feature['preprocessing']['in_memory'] = True
    preprocessing_params = merge_dict(
        default_preprocessing_parameters,
        model_definition['preprocessing']
    )
    output_features = model_definition[
        'output_features'] if evaluate_performance else []
    features = model_definition['input_features'] + output_features

    # Check if hdf5 file already exists
    if data_csv is not None:
        data_hdf5_fp = replace_file_extension(data_csv, 'hdf5')
        if os.path.isfile(data_hdf5_fp):
            logger.info('Found hdf5 with the same filename as the csv, '
                        'using it instead')
            data_csv = None
            data_hdf5 = data_hdf5_fp
    else:
        data_hdf5_fp = None

    # Load data
    train_set_metadata = load_metadata(train_set_metadata)
    if split == FULL:
        if data_hdf5 is not None:
            dataset = load_data(
                data_hdf5,
                model_definition['input_features'],
                output_features,
                split_data=False, shuffle_training=False
            )
        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata
            )
    else:
        if data_hdf5 is not None:
            training_set, test_set, validation_set = load_data(
                data_hdf5,
                model_definition['input_features'],
                output_features,
                shuffle_training=False
            )

            if split == TRAINING:
                dataset = training_set
            elif split == VALIDATION:
                dataset = validation_set
            else:  # if split == TEST:
                dataset = test_set

        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata
            )
            # build_dataset adds a split column if there is none in the csv
            # so if we want to check if the csv contained a split column
            # we have to check in the csv not in the built dataset.
            # The logic is that if there is no split in the original csv
            # we treat the split parameter as if it was == full
            if csv_contains_column(data_csv, SPLIT):
                training_set, test_set, validation_set = split_dataset_tvt(
                    dataset,
                    dataset[SPLIT]
                )
                if split == TRAINING:
                    dataset = training_set
                elif split == VALIDATION:
                    dataset = validation_set
                else:  # if split == TEST:
                    dataset = test_set
            else:
                logger.warning(
                    'You requested the {} split, but the data CSV '
                    'does not contain a "split" column, so the '
                    'full data will be used instead'.format(split)
                )

    replace_text_feature_level(
        features,
        [dataset]
    )

    dataset = Dataset(
        dataset,
        model_definition['input_features'],
        output_features,
        train_set_metadata.get(DATA_TRAIN_HDF5_FP)
    )

    return dataset, train_set_metadata
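When a specific split is requested for prediction, the function above simply picks one of the three sets returned by split_dataset_tvt. A small sketch of that selection step (the constant values below are assumptions; only the names TRAINING, VALIDATION and TEST come from the listing):

# illustrative only: mirror the selection logic in the listing above
TRAINING, VALIDATION, TEST = 'training', 'validation', 'test'

def select_split_sketch(split, training_set, test_set, validation_set):
    # anything other than training or validation falls back to the test set
    if split == TRAINING:
        return training_set
    elif split == VALIDATION:
        return validation_set
    return test_set

print(select_split_sketch(VALIDATION, {'n': 3}, {'n': 1}, {'n': 2}))  # {'n': 2}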
Example #6
def preprocess_for_training(
        model_definition,
        dataset_type='generic',
        data_df=None,
        data_train_df=None,
        data_validation_df=None,
        data_test_df=None,
        data_csv=None,
        data_train_csv=None,
        data_validation_csv=None,
        data_test_csv=None,
        data_hdf5=None,
        data_train_hdf5=None,
        data_validation_hdf5=None,
        data_test_hdf5=None,
        train_set_metadata_json=None,
        skip_save_processed_input=False,
        preprocessing_params=default_preprocessing_parameters,
        random_seed=default_random_seed):
    # Check if hdf5 and json already exist
    data_hdf5_fp = None
    data_train_hdf5_fp = None
    data_validation_hdf5_fp = None
    data_test_hdf5_fp = None
    train_set_metadata_json_fp = 'metadata.json'
    if data_csv is not None:
        data_hdf5_fp = os.path.splitext(data_csv)[0] + '.hdf5'
        train_set_metadata_json_fp = os.path.splitext(data_csv)[0] + '.json'
        if (os.path.isfile(data_hdf5_fp)
                and os.path.isfile(train_set_metadata_json_fp)):
            logging.info('Found hdf5 and json with the same filename '
                         'as the csv, using them instead')
            data_csv = None
            data_hdf5 = data_hdf5_fp
            train_set_metadata_json = train_set_metadata_json_fp

    if data_train_csv is not None:
        data_train_hdf5_fp = os.path.splitext(data_train_csv)[0] + '.hdf5'
        train_set_metadata_json_fp = os.path.splitext(
            data_train_csv)[0] + '.json'
        if (os.path.isfile(data_train_hdf5_fp)
                and os.path.isfile(train_set_metadata_json_fp)):
            logging.info('Found hdf5 and json with the same filename as '
                         'the train csv, using them instead')
            data_train_csv = None
            data_train_hdf5 = data_train_hdf5_fp
            train_set_metadata_json = train_set_metadata_json_fp

    if data_validation_csv is not None:
        data_validation_hdf5_fp = os.path.splitext(
            data_validation_csv)[0] + '.hdf5'
        if os.path.isfile(data_validation_hdf5_fp):
            logging.info('Found hdf5 with the same filename as '
                         'the validation csv, using it instead')
            data_validation_csv = None
            data_validation_hdf5 = data_validation_hdf5_fp

    if data_test_csv is not None:
        data_test_hdf5_fp = os.path.splitext(data_test_csv)[0] + '.hdf5'
        if os.path.isfile(data_test_hdf5_fp):
            logging.info('Found hdf5 with the same filename as '
                         'the test csv, using it instead')
            data_test_csv = None
            data_test_hdf5 = data_test_hdf5_fp

    model_definition['data_hdf5_fp'] = data_hdf5_fp

    # Decide whether to preprocess or just load
    features = (model_definition['input_features'] +
                model_definition['output_features'])
    (concatenate_csv, concatenate_df, build_dataset,
     build_dataset_df) = get_dataset_fun(dataset_type)

    if data_df is not None:
        # needs preprocessing
        logging.info('Using full dataframe')
        logging.info('Building dataset (it may take a while)')
        data, train_set_metadata = build_dataset_df(data_df,
                                                    features,
                                                    preprocessing_params,
                                                    random_seed=random_seed)
        if not skip_save_processed_input:
            logging.info('Writing dataset')
            data_utils.save_hdf5(data_hdf5_fp, data, train_set_metadata)
        logging.info('Writing train set metadata with vocabulary')
        data_utils.save_json(train_set_metadata_json_fp, train_set_metadata)
        training_set, test_set, validation_set = split_dataset_tvt(
            data, data['split'])

    elif data_train_df is not None:
        # needs preprocessing
        logging.info('Using training dataframe')
        logging.info('Building dataset (it may take a while)')
        concatenated_df = concatenate_df(data_train_df, data_validation_df,
                                         data_test_df)
        data, train_set_metadata = build_dataset_df(concatenated_df,
                                                    features,
                                                    preprocessing_params,
                                                    random_seed=random_seed)
        training_set, test_set, validation_set = split_dataset_tvt(
            data, data['split'])
        if not skip_save_processed_input:
            logging.info('Writing dataset')
            data_utils.save_hdf5(data_train_hdf5_fp, training_set,
                                 train_set_metadata)
            if validation_set is not None:
                data_utils.save_hdf5(data_validation_hdf5_fp, validation_set,
                                     train_set_metadata)
            if test_set is not None:
                data_utils.save_hdf5(data_test_hdf5_fp, test_set,
                                     train_set_metadata)
        logging.info('Writing train set metadata with vocabulary')
        data_utils.save_json(train_set_metadata_json_fp, train_set_metadata)

    elif data_csv is not None:
        # Use the full csv and ignore the _train, _validation and _test csvs,
        # as well as the train set metadata; the data needs preprocessing
        logging.info('Using full raw csv: no hdf5 and json files '
                     'with the same name have been found')
        logging.info('Building dataset (it may take a while)')
        data, train_set_metadata = build_dataset(data_csv,
                                                 features,
                                                 preprocessing_params,
                                                 random_seed=random_seed)
        if not skip_save_processed_input:
            logging.info('Writing dataset')
            data_utils.save_hdf5(data_hdf5_fp, data, train_set_metadata)
            logging.info('Writing train set metadata with vocabulary')
            data_utils.save_json(train_set_metadata_json_fp,
                                 train_set_metadata)
        training_set, test_set, validation_set = split_dataset_tvt(
            data, data['split'])

    elif data_train_csv is not None:
        # use the training csv (plus _validation and _test if present) and
        # ignore the full csv and the train set metadata;
        # the data needs preprocessing
        logging.info('Using training raw csv: no hdf5 and json '
                     'files with the same name have been found')
        logging.info('Building dataset (it may take a while)')
        concatenated_df = concatenate_csv(data_train_csv, data_validation_csv,
                                          data_test_csv)
        concatenated_df.csv = data_train_csv
        data, train_set_metadata = build_dataset_df(concatenated_df,
                                                    features,
                                                    preprocessing_params,
                                                    random_seed=random_seed)
        training_set, test_set, validation_set = split_dataset_tvt(
            data, data['split'])
        if not skip_save_processed_input:
            logging.info('Writing dataset')
            data_utils.save_hdf5(data_train_hdf5_fp, training_set,
                                 train_set_metadata)
            if validation_set is not None:
                data_utils.save_hdf5(data_validation_hdf5_fp, validation_set,
                                     train_set_metadata)
            if test_set is not None:
                data_utils.save_hdf5(data_test_hdf5_fp, test_set,
                                     train_set_metadata)
            logging.info('Writing train set metadata with vocabulary')
            data_utils.save_json(train_set_metadata_json_fp,
                                 train_set_metadata)

    elif data_hdf5 is not None and train_set_metadata_json is not None:
        # use data and train set metadata
        # doesn't need preprocessing, just load
        logging.info('Using full hdf5 and json')
        training_set, test_set, validation_set = load_data(
            data_hdf5,
            model_definition['input_features'],
            model_definition['output_features'],
            shuffle_training=True)
        train_set_metadata = load_metadata(train_set_metadata_json)

    elif data_train_hdf5 is not None and train_set_metadata_json is not None:
        # use data and train set metadata
        # doesn't need preprocessing, just load
        logging.info('Using hdf5 and json')
        training_set = load_data(data_train_hdf5,
                                 model_definition['input_features'],
                                 model_definition['output_features'],
                                 split_data=False)
        train_set_metadata = load_metadata(train_set_metadata_json)
        if data_validation_hdf5 is not None:
            validation_set = load_data(data_validation_hdf5,
                                       model_definition['input_features'],
                                       model_definition['output_features'],
                                       split_data=False)
        else:
            validation_set = None
        if data_test_hdf5 is not None:
            test_set = load_data(data_test_hdf5,
                                 model_definition['input_features'],
                                 model_definition['output_features'],
                                 split_data=False)
        else:
            test_set = None

    else:
        raise RuntimeError('Insufficient input parameters')

    replace_text_feature_level(model_definition,
                               [training_set, validation_set, test_set])

    training_dataset = Dataset(training_set,
                               model_definition['input_features'],
                               model_definition['output_features'],
                               data_hdf5_fp)

    validation_dataset = None
    if validation_set is not None:
        validation_dataset = Dataset(validation_set,
                                     model_definition['input_features'],
                                     model_definition['output_features'],
                                     data_hdf5_fp)

    test_dataset = None
    if test_set is not None:
        test_dataset = Dataset(test_set, model_definition['input_features'],
                               model_definition['output_features'],
                               data_hdf5_fp)

    return (training_dataset, validation_dataset, test_dataset,
            train_set_metadata)
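preprocess_for_training accepts several mutually exclusive inputs and resolves them in a fixed order: in-memory dataframes win over raw csvs, which win over already preprocessed hdf5/json caches. A compact sketch of that precedence (the argument names come from the listing; the function body and return strings are illustrative only):

def resolve_input_sketch(data_df=None, data_train_df=None,
                         data_csv=None, data_train_csv=None,
                         data_hdf5=None, data_train_hdf5=None,
                         train_set_metadata_json=None):
    # illustrative only: mirror the if/elif chain that decides preprocessing vs loading
    if data_df is not None:
        return 'preprocess full dataframe'
    elif data_train_df is not None:
        return 'preprocess concatenated train/validation/test dataframes'
    elif data_csv is not None:
        return 'preprocess full csv'
    elif data_train_csv is not None:
        return 'preprocess concatenated train/validation/test csvs'
    elif data_hdf5 is not None and train_set_metadata_json is not None:
        return 'load preprocessed full hdf5 and metadata json'
    elif data_train_hdf5 is not None and train_set_metadata_json is not None:
        return 'load preprocessed per-split hdf5 and metadata json'
    raise RuntimeError('Insufficient input parameters')

print(resolve_input_sketch(data_csv='data.csv'))  # preprocess full csv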
Example #7
def test_model_save_reload_API(csv_filename, tmp_path):
    tf.random.set_seed(1234)

    dir_path = os.path.dirname(csv_filename)
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')
    audio_dest_folder = os.path.join(os.getcwd(), 'generated_audio')

    input_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3,
                     encoder='rnn',
                     cell_type='lstm',
                     num_layers=2,
                     bidirectional=True),
        vector_feature(),
        image_feature(image_dest_folder),
        audio_feature(audio_dest_folder, encoder='stacked_cnn'),
        timeseries_feature(encoder='parallel_cnn'),
        sequence_feature(vocab_size=3, encoder='stacked_parallel_cnn'),
        date_feature(),
        h3_feature(),
        set_feature(vocab_size=3),
        bag_feature(vocab_size=3),
    ]

    output_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3),
        set_feature(vocab_size=3),
        vector_feature(),
    ]

    # Generate test data
    data_csv_path = generate_data(input_features, output_features,
                                  csv_filename)

    #############
    # Train model
    #############
    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'training': {
            'epochs': 2
        }
    }

    data_df = read_csv(data_csv_path)
    training_set, test_set, validation_set = split_dataset_tvt(
        data_df, get_split(data_df))
    training_set = pd.DataFrame(training_set)
    validation_set = pd.DataFrame(validation_set)
    test_set = pd.DataFrame(test_set)

    # create sub-directory to store results
    results_dir = tmp_path / 'results'
    results_dir.mkdir()

    # perform initial model training
    ludwig_model1 = LudwigModel(model_definition)
    train_stats = ludwig_model1.train(
        data_train_df=training_set,
        data_validation_df=validation_set,
        data_test_df=test_set,
        output_directory=str(results_dir)  # store results in the tmp_path sub-directory
    )

    preds_1 = ludwig_model1.predict(data_df=validation_set)

    def check_model_equal(ludwig_model2):
        # Compare model predictions
        preds_2 = ludwig_model2.predict(data_df=validation_set)
        assert set(preds_1.keys()) == set(preds_2.keys())
        for key in preds_1:
            assert preds_1[key].dtype == preds_2[key].dtype, key
            assert list(preds_1[key]) == list(preds_2[key]), key
            # assert preds_2[key].dtype == preds_3[key].dtype, key
            # assert list(preds_2[key]) == list(preds_3[key]), key

        # Compare model weights
        # this has to be done after predicts because of TF2 lazy restoration
        for if_name in ludwig_model1.model.model.input_features:
            if1 = ludwig_model1.model.model.input_features[if_name]
            if2 = ludwig_model2.model.model.input_features[if_name]
            for if1_w, if2_w in zip(if1.encoder_obj.weights,
                                    if2.encoder_obj.weights):
                assert np.allclose(if1_w.numpy(), if2_w.numpy())

        c1 = ludwig_model1.model.model.combiner
        c2 = ludwig_model2.model.model.combiner
        for c1_w, c2_w in zip(c1.weights, c2.weights):
            assert np.allclose(c1_w.numpy(), c2_w.numpy())

        for of_name in ludwig_model1.model.model.output_features:
            of1 = ludwig_model1.model.model.output_features[of_name]
            of2 = ludwig_model2.model.model.output_features[of_name]
            for of1_w, of2_w in zip(of1.decoder_obj.weights,
                                    of2.decoder_obj.weights):
                assert np.allclose(of1_w.numpy(), of2_w.numpy())

    # Test saving and loading the model explicitly
    with tempfile.TemporaryDirectory() as tmpdir:
        ludwig_model1.save(tmpdir)
        ludwig_model_loaded = LudwigModel.load(tmpdir)
        check_model_equal(ludwig_model_loaded)

    # Test loading the model from the experiment directory
    ludwig_model_exp = LudwigModel.load(
        os.path.join(ludwig_model1.exp_dir_name, 'model'))
    check_model_equal(ludwig_model_exp)