Example #1
0
def train(config_file, experiment_name):
    """Train a Ludwig model on the (shuffled) Higgs dataset and print stats.

    Args:
        config_file: Path to the Ludwig model config file (converted to str
            before being handed to ``LudwigModel``).
        experiment_name: Name under which Ludwig records this experiment.
    """
    output_directory = SCRIPT_DIR / 'output' / 'results'
    model = LudwigModel(config=str(config_file))
    higgs_df = higgs.load().sample(frac=1)  # shuffle data
    print(higgs_df.shape)
    # FIX: removed a leftover debug `sys.exit()` here that made everything
    # below unreachable, plus a dead local-path assignment that was
    # immediately overwritten by the S3 path below.
    higgs_data_file = (
        "s3://data-science.s3.liftoff.io/datascience/temp/higgs_small.parquet")
    # Persist the shuffled frame as partitioned parquet so Ludwig can read
    # it back via the dataset path (npartitions=30 controls file count).
    higgs_ddf = dd.from_pandas(higgs_df, npartitions=30)
    higgs_ddf.to_parquet(higgs_data_file, engine="pyarrow")

    (training_statistics, preprocessed_data, output_directory) = model.train(
        dataset=higgs_data_file,
        data_format="parquet",
        experiment_name=experiment_name,
        skip_save_processed_input=False,
        output_directory=str(output_directory),
    )
    print('training_statistics keys: {}'.format(training_statistics.keys()))
    # model.train returns the preprocessed splits plus their metadata.
    (
        training_set,
        validation_set,
        test_set,
        training_set_metadata,
    ) = preprocessed_data
    print(f'{training_set.size=}, {validation_set.size=}, {test_set.size=}')
    print('training_set_metadata keys {}'.format(training_set_metadata.keys()))
    print(f'{output_directory=}')
Example #2
0
import logging

from ludwig.api import LudwigModel
from ludwig.datasets import higgs

# Build a Ludwig model from the medium-sized config, logging at INFO level,
# then train it on the full Higgs dataset under the 'higgs_medium' experiment.
model = LudwigModel(
    logging_level=logging.INFO,
    config='medium_config.yaml',
)

higgs_df = higgs.load()
model.train(
    dataset=higgs_df,
    model_name='higgs_tabnet_medium',
    experiment_name='higgs_medium',
)